library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
df=data.frame(iris)
iris.2 <- iris[,-5]
species <- iris[,5]
di=data.frame(df) %>% 
  mutate(Species=dplyr::recode(Species,
                               setosa="st",
                               versicolor="vs",
                               virginica="vg"))  
pairs(di[,1:4], col = species,lower.panel = NULL)
par(xpd = TRUE)
legend(x = 0.05, y = 0.4, cex = 2,
       legend=as.character(levels(species)),
       fill = unique(species))

par(xpd = NA)
set.seed(20)
k.means.fit <- kmeans(df[,1:4], 3)
k.means.fit$centers
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
k.means.fit$ifault
## [1] 0
grupos=k.means.fit$cluster
table(di$Species,grupos) #matriz de confusión (puedo hacerlo porque conozco las variedades)
##     grupos
##       1  2  3
##   st 50  0  0
##   vs  0 48  2
##   vg  0 14 36
dif=data.frame(di,grupos)
dif=data.frame(dif) %>% 
  mutate(grupos=dplyr::recode(grupos,
                              "3"="st",
                              "2"="vs",
                              "1"="vg")) 
table(dif$grupos,dif$Species)
##     
##      st vs vg
##   st  0  2 36
##   vg 50  0  0
##   vs  0 48 14
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
d2 <- scale(di[,1:4])
rownames(d2) <- di$Species
#www:for total within sum of square
fviz_nbclust(x = d2, FUNcluster = kmeans, method = "wss", k.max = 15, 
             diss = get_dist(d2, method = "euclidean"), nstart = 50)

set.seed(123)
d2f=data.frame(d2)
km_clusters <- kmeans(x = d2f, centers = 3, nstart = 50)

# Las funciones del paquete factoextra emplean el nombre de las filas del
# dataframe que contiene los datos como identificador de las observaciones.
# Esto permite añadir labels a los gráficos.
fviz_cluster(object = km_clusters, data = d2f, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
             pointsize=0.5,outlier.color="darkred") +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +  theme(legend.position = "none")

require(cluster)
## Loading required package: cluster
pam.res <- pam(d2f, 3)
# Visualización
fviz_cluster(pam.res, geom = "point", ellipse.type = "norm",
             show.clust.cent = TRUE,star.plot = TRUE)+
  labs(title = "Resultados clustering K-means")+ theme_bw()

# Load iris dataset
data(iris)
# PCA
pca <- prcomp(iris[,-5], scale=TRUE)
df.pca <- pca$x
# Cluster over the three first PCA dimensions
kc <- kmeans(df.pca[,1:3], 4)
fviz_pca_biplot(pca, label="var", habillage=as.factor(kc$cluster)) +
  labs(color=NULL) + ggtitle("") +
  theme(text = element_text(size = 15),
        panel.background = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"), 
        legend.key = element_rect(fill = "white"))

library(cluster)
fviz_nbclust(x = d2f[,1:4], FUNcluster = pam, method = "wss", k.max = 15,
             diss = dist(d2, method = "manhattan"))

set.seed(123)
pam_clusters <- pam(x = d2f[,1:4], k = 4, metric = "manhattan")
pam_clusters
## Medoids:
##        ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## st.7    8   -1.0184372   0.7861738  -1.27910398  -1.3110521
## vg.12 113    1.1553023  -0.1315388   0.98680210   1.1816087
## vs.28  79    0.1891958  -0.3609670   0.42032558   0.3944526
## vs.19  70   -0.2938574  -1.2786796   0.08043967  -0.1303181
## Clustering vector:
##    st  st.1  st.2  st.3  st.4  st.5  st.6  st.7  st.8  st.9 st.10 st.11 st.12 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
## st.13 st.14 st.15 st.16 st.17 st.18 st.19 st.20 st.21 st.22 st.23 st.24 st.25 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
## st.26 st.27 st.28 st.29 st.30 st.31 st.32 st.33 st.34 st.35 st.36 st.37 st.38 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
## st.39 st.40 st.41 st.42 st.43 st.44 st.45 st.46 st.47 st.48 st.49    vs  vs.1 
##     1     1     1     1     1     1     1     1     1     1     1     2     3 
##  vs.2  vs.3  vs.4  vs.5  vs.6  vs.7  vs.8  vs.9 vs.10 vs.11 vs.12 vs.13 vs.14 
##     2     4     3     3     3     4     3     4     4     3     4     3     3 
## vs.15 vs.16 vs.17 vs.18 vs.19 vs.20 vs.21 vs.22 vs.23 vs.24 vs.25 vs.26 vs.27 
##     3     3     4     3     4     3     3     3     3     3     3     3     2 
## vs.28 vs.29 vs.30 vs.31 vs.32 vs.33 vs.34 vs.35 vs.36 vs.37 vs.38 vs.39 vs.40 
##     3     4     4     4     4     3     3     3     3     4     3     4     4 
## vs.41 vs.42 vs.43 vs.44 vs.45 vs.46 vs.47 vs.48 vs.49    vg  vg.1  vg.2  vg.3 
##     3     4     4     4     3     3     3     4     3     2     3     2     2 
##  vg.4  vg.5  vg.6  vg.7  vg.8  vg.9 vg.10 vg.11 vg.12 vg.13 vg.14 vg.15 vg.16 
##     2     2     4     2     2     2     2     2     2     4     3     2     2 
## vg.17 vg.18 vg.19 vg.20 vg.21 vg.22 vg.23 vg.24 vg.25 vg.26 vg.27 vg.28 vg.29 
##     2     2     3     2     3     2     3     2     2     3     3     2     2 
## vg.30 vg.31 vg.32 vg.33 vg.34 vg.35 vg.36 vg.37 vg.38 vg.39 vg.40 vg.41 vg.42 
##     2     2     2     3     3     2     2     2     3     2     2     2     3 
## vg.43 vg.44 vg.45 vg.46 vg.47 vg.48 vg.49 
##     2     2     2     3     2     2     3 
## Objective function:
##    build     swap 
## 1.316741 1.220666 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
pam_clusters$medoids
##       Sepal.Length Sepal.Width Petal.Length Petal.Width
## st.7    -1.0184372   0.7861738  -1.27910398  -1.3110521
## vg.12    1.1553023  -0.1315388   0.98680210   1.1816087
## vs.28    0.1891958  -0.3609670   0.42032558   0.3944526
## vs.19   -0.2938574  -1.2786796   0.08043967  -0.1303181
fviz_cluster(object = pam_clusters, data = d2f[,1:4], 
             ellipse.type = "t",repel = TRUE) +
  theme_bw() +   labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

medoids <- prcomp(d2f[,1:4])$x
medoids <- medoids[rownames(pam_clusters$medoids), c("PC1", "PC2")]
medoids <- as.data.frame(medoids)
colnames(medoids) <- c("x", "y")
# Creación del gráfico
fviz_cluster(object = pam_clusters, data = d2f[,1:4], ellipse.type = "t",
             repel = TRUE) +  theme_bw() +
  # Se resaltan las observaciones que actúan como medoids
  geom_point(data = medoids, color = "firebrick", size = 2) +
  labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

### método clara
library(simstudy)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
set.seed(555)
dt1 <- genCorData(200, mu = c(37.88,16.79, 11.74,21.4,34.04),
                  sigma = 1, rho = 0.5, corstr = "cs" )
dt2 <- genCorData(200, mu = c(35.94,17.90, 11.81,21.77,32.86),
                  sigma = 1, rho = 0.7, corstr = "cs" )
dac <- rbind(dt1,dt2)
DDA=gl(2,200,400,labels=c("DDA224","DDA231"))
dfc=data.frame(DDA,dac)
View(dfc)
colnames(dfc) <- c("DDA","id","L", "a","b","C","h")
dfc=dfc[,-2]
head(dfc)
##      DDA        L        a        b        C        h
## 1 DDA224 36.71324 16.59757 11.88936 20.80325 34.02769
## 2 DDA224 38.31073 15.76697 13.00676 21.46628 35.65491
## 3 DDA224 38.66201 17.00284 12.14174 22.81913 34.63488
## 4 DDA224 37.53979 17.02302 14.26060 21.62405 34.72869
## 5 DDA224 38.97227 15.98841 12.11759 20.35450 34.15341
## 6 DDA224 38.61491 17.14862 11.28327 22.67647 34.46416
dac <- rbind(dt1,dt2)
DDA=gl(2,200,400,labels=c("DDA224","DDA231"))
dfc=data.frame(DDA,dac)
colnames(dfc) <- c("DDA","id","L", "a","b","C","h")
dfc=dfc[,-2] # quito id
head(dfc)
##      DDA        L        a        b        C        h
## 1 DDA224 36.71324 16.59757 11.88936 20.80325 34.02769
## 2 DDA224 38.31073 15.76697 13.00676 21.46628 35.65491
## 3 DDA224 38.66201 17.00284 12.14174 22.81913 34.63488
## 4 DDA224 37.53979 17.02302 14.26060 21.62405 34.72869
## 5 DDA224 38.97227 15.98841 12.11759 20.35450 34.15341
## 6 DDA224 38.61491 17.14862 11.28327 22.67647 34.46416
options(digits = 3)
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble  3.0.1     v purrr   0.3.3
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x dplyr::filter()       masks stats::filter()
## x data.table::first()   masks dplyr::first()
## x dplyr::lag()          masks stats::lag()
## x data.table::last()    masks dplyr::last()
## x car::recode()         masks dplyr::recode()
## x purrr::some()         masks car::some()
## x purrr::transpose()    masks data.table::transpose()
dfc %>%
  split(.$DDA) %>% 
  map(select,-c(DDA)) %>% 
  map(cor) 
## $DDA224
##       L     a     b     C     h
## L 1.000 0.468 0.415 0.535 0.417
## a 0.468 1.000 0.463 0.505 0.454
## b 0.415 0.463 1.000 0.391 0.437
## C 0.535 0.505 0.391 1.000 0.526
## h 0.417 0.454 0.437 0.526 1.000
## 
## $DDA231
##       L     a     b     C     h
## L 1.000 0.664 0.681 0.699 0.680
## a 0.664 1.000 0.714 0.683 0.735
## b 0.681 0.714 1.000 0.686 0.696
## C 0.699 0.683 0.686 1.000 0.707
## h 0.680 0.735 0.696 0.707 1.000
library(cluster)
library(factoextra)
clara_clusters <- clara(x = dfc, k = 2, metric = "manhattan", stand = TRUE,
                        samples = 60, pamLike = TRUE)
clara_clusters$sample
##  [1]   2   9  16  25  26  33  37  41  61  87  90  94  95 107 110 115 116 141 142
## [20] 146 148 150 159 161 183 193 204 222 223 228 244 249 251 253 256 257 282 284
## [39] 296 302 304 314 325 380
clara_clusters$medoids
##      DDA    L  a    b    C    h
## [1,]   1 37.5 17 11.7 21.2 34.4
## [2,]   2 36.1 18 11.8 21.9 32.6
clara_clusters$i.med
## [1]  94 302
clara_clusters$clustering
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [334] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
table(clara_clusters$clustering)
## 
##   1   2 
## 197 203
fviz_cluster(object = clara_clusters, ellipse.type = "t", geom = "point",
             pointsize = 1.5) +  theme_bw() +
  labs(title = "Resultados clustering CLARA") +
  theme(legend.position = "none")

medoides <- prcomp(dfc[,-1])$x
medoides <- medoides[rownames(pam_clusters$medoides), c("PC1", "PC2")]
medoides <- as.data.frame(medoides)
colnames(medoides) <- c("x", "y")
# Creación del gráfico
fviz_cluster(object = clara_clusters, data = d2f[,-1], ellipse.type = "t",
             repel = TRUE,pointsize = 0.5) +  theme_bw() +
  geom_point(data = medoides, color = "firebrick", size = 1,) +
  labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

#Hacer para el método clara el BIplot, para saber cual de las variables quedo mejor representadas. 

pcacl <- prcomp(dfc[1:200,2:6], scale=TRUE)
df.pca1 <- pcacl$x

clara_clusterspca <- clara(x = df.pca1, k = 2, metric = "manhattan", stand = TRUE,
                           samples = 60, pamLike = TRUE)
clara_clusterspca$sample
##  [1] "4"   "8"   "12"  "14"  "19"  "27"  "33"  "41"  "43"  "44"  "53"  "55" 
## [13] "56"  "58"  "62"  "63"  "70"  "79"  "87"  "107" "115" "120" "121" "129"
## [25] "130" "135" "138" "139" "145" "149" "153" "154" "159" "165" "167" "171"
## [37] "173" "174" "176" "179" "181" "183" "185" "199"
clara_clusterspca$medoids
##        PC1    PC2    PC3    PC4    PC5
## 154 -0.718  0.317 -0.213  0.172  0.413
## 165  0.626 -0.224 -0.058 -0.161 -0.429
clara_clusterspca$i.med
## [1] 154 165
clara_clusterspca$clustering
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   2   2   1   2   2   2   2   2   2   2   2   2   1   2   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   2   1   1   2   2   2   1   2   2   2   1   2   1   1   1   1   2   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   2   2   1   2   2   1   1   2   2   2   1   1   1   1   2   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   2   2   1   1   1   1   2   2   2   1   1   2   2   2   1   2   2   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   1   2   2   2   1   2   2   1   2   2   1   2   1   2   2   2   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   2   1   2   2   1   2   1   1   2   2   2   2   1   1   2   2   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   2   2   2   2   1   1   1   1   1   1   1   2   2   2   1   2   2   2   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   2   2   2   1   1   1   2   1   1   1   1   1   1   2   2   1   2   2   2 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   2   2   2   2   2   2   2   2   2   2   2   1   2   2   1   2   2   1   2   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   2   2   2   1   1   1   1   2   1   1   1   1   1   2   2   1   2   1   2   1
table(clara_clusters$clustering)
## 
##   1   2 
## 197 203
fviz_pca_biplot(pcacl, label="var", habillage=as.factor(clara_clusterspca$clustering)) +
  labs(color=NULL) + ggtitle("") +
  theme(text = element_text(size = 15),
        panel.background = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.key = element_rect(fill = "white"))

#segundo muestreo
pcacl2 <- prcomp(dfc[201:400,2:6], scale=TRUE)
df.pca2 <- pcacl$x

clara_clusterspca2 <- clara(x = df.pca2, k = 2, metric = "manhattan", stand = TRUE,
                            samples = 60, pamLike = TRUE)
clara_clusterspca2$sample
##  [1] "4"   "8"   "12"  "14"  "19"  "27"  "33"  "41"  "43"  "44"  "53"  "55" 
## [13] "56"  "58"  "62"  "63"  "70"  "79"  "87"  "107" "115" "120" "121" "129"
## [25] "130" "135" "138" "139" "145" "149" "153" "154" "159" "165" "167" "171"
## [37] "173" "174" "176" "179" "181" "183" "185" "199"
clara_clusterspca2$medoids
##        PC1    PC2    PC3    PC4    PC5
## 154 -0.718  0.317 -0.213  0.172  0.413
## 165  0.626 -0.224 -0.058 -0.161 -0.429
clara_clusterspca2$i.med
## [1] 154 165
clara_clusterspca2$clustering
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   2   2   1   2   2   2   2   2   2   2   2   2   1   2   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   2   1   1   2   2   2   1   2   2   2   1   2   1   1   1   1   2   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   2   2   1   2   2   1   1   2   2   2   1   1   1   1   2   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   2   2   1   1   1   1   2   2   2   1   1   2   2   2   1   2   2   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   1   2   2   2   1   2   2   1   2   2   1   2   1   2   2   2   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   2   1   2   2   1   2   1   1   2   2   2   2   1   1   2   2   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   2   2   2   2   1   1   1   1   1   1   1   2   2   2   1   2   2   2   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   2   2   2   1   1   1   2   1   1   1   1   1   1   2   2   1   2   2   2 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   2   2   2   2   2   2   2   2   2   2   2   1   2   2   1   2   2   1   2   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   2   2   2   1   1   1   1   2   1   1   1   1   1   2   2   1   2   1   2   1
table(clara_clusterspca2$clustering)
## 
##   1   2 
##  94 106
fviz_pca_biplot(pcacl2, label="var", habillage=as.factor(clara_clusterspca2$clustering)) +
  labs(color=NULL) + ggtitle("") +
  theme(text = element_text(size = 15),
        panel.background = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.key = element_rect(fill = "white"))