#Instalación de paquetes y librerĆas
library(ggplot2)
library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(simstudy)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidyverse)
## āā Attaching core tidyverse packages āāāāāāāāāāāāāāāāāāāāāāāā tidyverse 2.0.0 āā
## ā forcats 1.0.0 ā stringr 1.5.0
## ā lubridate 1.9.2 ā tibble 3.2.1
## ā purrr 1.0.1 ā tidyr 1.3.0
## ā readr 2.1.4
## āā Conflicts āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā tidyverse_conflicts() āā
## ā data.table::between() masks dplyr::between()
## ā dplyr::filter() masks stats::filter()
## ā data.table::first() masks dplyr::first()
## ā lubridate::hour() masks data.table::hour()
## ā lubridate::isoweek() masks data.table::isoweek()
## ā dplyr::lag() masks stats::lag()
## ā data.table::last() masks dplyr::last()
## ā lubridate::mday() masks data.table::mday()
## ā lubridate::minute() masks data.table::minute()
## ā lubridate::month() masks data.table::month()
## ā lubridate::quarter() masks data.table::quarter()
## ā dplyr::recode() masks car::recode()
## ā lubridate::second() masks data.table::second()
## ā purrr::some() masks car::some()
## ā purrr::transpose() masks data.table::transpose()
## ā lubridate::wday() masks data.table::wday()
## ā lubridate::week() masks data.table::week()
## ā lubridate::yday() masks data.table::yday()
## ā lubridate::year() masks data.table::year()
## ā¹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:data.table':
##
## first, last
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
library(corrr)
#Datos
x <- c(4, 4.5, 4, 7.5, 7, 6, 5, 5.5, 5, 6)
y <- c(4, 4.5, 4, 7.5, 7, 6, 5, 5.5, 5, 6) + 4
z <- c(5, 5.5, 4.8, 5.4, 4.7, 5.6, 5.3, 5.5, 5.2, 4.8)
datos <- data.frame(punto = rep(c("x", "y", "z"), each = 10),
respuesta = c(x,y,z),predictor = 1:10)
datos
#Visualización datos
ggplot(data = datos, aes(x = as.factor(predictor), y = respuesta,
colour = punto)) + geom_path(aes(group = punto)) +
geom_point() + labs(x = "predictor") + theme_bw() +
theme(legend.position = "bottom")
x1=matrix(c(x,y))
x2=matrix(c(x,z))
x3=matrix(c(y,z))
sum(dist(x1, method = "euclidean",diag=F,upper=F,p=2),1)
## [1] 528
sum(dist(x2, method = "euclidean",diag=F,upper=F,p=2),1)
## [1] 180.3
sum(dist(x3, method = "euclidean",diag=F,upper=F,p=2),1)
## [1] 509.3
R=matrix(c(x,y,z),ncol=3)
dcor=1-cor(R)
dist(x = rbind(x,y,z), method = "euclidean")
## x y
## y 12.64911
## z 3.75100 13.98821
dcor <- 1 - cor(x = cbind(x,y,z), method = "pearson")
dcor
## x y z
## x 0.0000000 0.0000000 0.9466303
## y 0.0000000 0.0000000 0.9466303
## z 0.9466303 0.9466303 0.0000000
#AnƔlisis exploratorio
df <- data.frame(iris)
iris.2 <- iris[,-5]
species <- iris[,5]
di=data.frame(df) %>%
mutate(Species=dplyr::recode(Species,
setosa="st",
versicolor="vs",
virginica="vg"))
pairs(di[,1:4], col = species,lower.panel = NULL)
par(xpd = TRUE)
legend(x = 0.05, y = 0.4, cex = 2,
legend=as.character(levels(species)),
fill = unique(species))
par(xpd = NA)
#MƩtodo de K-means
set.seed(20)
k.means.fit <-kmeans(di[,1:4], 3, nstart = 10)
k.means.fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
k.means.fit$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
k.means.fit$ifault
## [1] 0
grupos=k.means.fit$cluster
table(di$Species,grupos) #Matriz de confusión
## grupos
## 1 2 3
## st 50 0 0
## vs 0 48 2
## vg 0 14 36
dif=data.frame(di,grupos)
dif=data.frame(dif) %>%
mutate(grupos=dplyr::recode(grupos,
"3"="st",
"2"="vs",
"1"="vg"))
table(dif$grupos,dif$Species)
##
## st vs vg
## st 0 2 36
## vg 50 0 0
## vs 0 48 14
#Combinación K-means y PCA
d2 <- scale(di[,1:4])
rownames(d2) <- di$Species
fviz_nbclust(x = d2, FUNcluster = kmeans, method = "wss", k.max = 15,
diss = get_dist(d2, method = "euclidean"), nstart = 50)
x|set.seed(123)
## logical(0)
d2f=data.frame(d2)
km_clusters <- kmeans(x = d2f, centers = 3, nstart = 50)
Las funciones del paquete factoextra emplean el nombre de las filas deldataframe que contiene los datos como identificador de las observaciones.Esto permite aƱadir labels a los grƔficos.
fviz_cluster(object = km_clusters, data = d2f, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
pointsize=0.5,outlier.color="darkred") +
labs(title = "Resultados clustering K-means") +
theme_bw() + theme(legend.position = "none")
require(cluster)
pam.res <- pam(d2f, 3)
# Visualización
fviz_cluster(pam.res, geom = "point", ellipse.type = "norm",
show.clust.cent = TRUE,star.plot = TRUE)+
labs(title = "Resultados clustering K-means")+ theme_bw()
#Biplot PCA y K-Means para medir representatividad
data(iris)
# PCA
pca <- prcomp(iris[,-5], scale=TRUE)
df.pca <- pca$x
# Cluster over the three first PCA dimensions
kc <- kmeans(df.pca[,1:3], 3)
fviz_pca_biplot(pca, label="var", habillage=as.factor(kc$cluster)) +
labs(color=NULL) + ggtitle("") +
theme(text = element_text(size = 15),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
legend.key = element_rect(fill = "white"))
#K-Medioides con algoritmo PAM
fviz_nbclust(x = d2f[,1:4], FUNcluster = pam, method = "wss", k.max = 15,
diss = dist(d2, method = "manhattan"))
set.seed(123)
pam_clusters <- pam(x = d2f[,1:4], k = 3, metric = "manhattan")
pam_clusters$medoids
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## st.7 -1.0184372 0.7861738 -1.2791040 -1.3110521
## vg.16 0.7930124 -0.1315388 0.9868021 0.7880307
## vs.44 -0.2938574 -0.8198233 0.2503826 0.1320673
fviz_cluster(object = pam_clusters, data = d2f[,1:4],
ellipse.type = "t",repel = TRUE) +
theme_bw() + labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")
medoids <- prcomp(d2f[,1:4])$x
medoids <- medoids[rownames(pam_clusters$medoids), c("PC1", "PC2")]
medoids <- as.data.frame(medoids)
colnames(medoids) <- c("x", "y")
# Creación del grÔfico
fviz_cluster(object = pam_clusters, data = d2f[,1:4], ellipse.type = "t",
repel = TRUE) + theme_bw() +
# Se resaltan las observaciones que actĆŗan como medoids
geom_point(data = medoids, color = "firebrick", size = 2) +
labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")
#Método no jerÔrquico CLARA basado en simulación y muestreo
set.seed(555)
dt1 <- genCorData(200, mu = c(37.88,16.79, 11.74,21.4,34.04),
sigma = 1, rho = 0.5, corstr = "cs" )
#Con la función rho se generan datos aleatorios CORRELACIONADOS, y no dependientes como se generan con los paquetes bÔsicos.
dt2 <- genCorData(200, mu = c(35.94,17.90, 11.81,21.77,32.86),
sigma = 1, rho = 0.7, corstr = "cs" )
dac <- rbind(dt1,dt2)
DDA=gl(2,200,400,labels=c("DDA224","DDA231"))
dfc=data.frame(DDA,dac)
colnames(dfc) <- c("DDA","id","L", "a","b","C","h")
dfc=dfc[,-2]
head(dfc)
options(digits = 3)
dfc %>%
split(.$DDA) %>%
map(select, -c(DDA)) %>%
map(cor)
## $DDA224
## L a b C h
## L 1.000 0.468 0.415 0.535 0.417
## a 0.468 1.000 0.463 0.505 0.454
## b 0.415 0.463 1.000 0.391 0.437
## C 0.535 0.505 0.391 1.000 0.526
## h 0.417 0.454 0.437 0.526 1.000
##
## $DDA231
## L a b C h
## L 1.000 0.664 0.681 0.699 0.680
## a 0.664 1.000 0.714 0.683 0.735
## b 0.681 0.714 1.000 0.686 0.696
## C 0.699 0.683 0.686 1.000 0.707
## h 0.680 0.735 0.696 0.707 1.000
clara_clusters <- clara(x = dfc, k = 2, metric = "manhattan", stand = TRUE,
samples = 60, pamLike = TRUE)
clara_clusters$sample
## [1] 2 9 16 25 26 33 37 41 61 87 90 94 95 107 110 115 116 141 142
## [20] 146 148 150 159 161 183 193 204 222 223 228 244 249 251 253 256 257 282 284
## [39] 296 302 304 314 325 380
clara_clusters$medoids
## DDA L a b C h
## [1,] 1 37.5 17 11.7 21.2 34.4
## [2,] 2 36.1 18 11.8 21.9 32.6
clara_clusters$i.med
## [1] 94 302
clara_clusters$clustering
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [334] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
table(clara_clusters$clustering)
##
## 1 2
## 197 203
fviz_cluster(object = clara_clusters, ellipse.type = "t", geom = "point",
pointsize = 1.5) + theme_bw() +
labs(title = "Resultados clustering CLARA") +
theme(legend.position = "none")
medoides <- prcomp(dfc[,-1])$x
medoides <- medoides[rownames(pam_clusters$medoides), c("PC1", "PC2")]
medoides <- as.data.frame(medoides)
colnames(medoides) <- c("x", "y")
# Creación del grÔfico
fviz_cluster(object = clara_clusters, data = d2f[,-1], ellipse.type = "t",
repel = TRUE,pointsize = 0.5) + theme_bw() +
# Se resaltan las observaciones que actĆŗan como medoids
geom_point(data = medoides, color = "firebrick", size = 1,) +
labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")
#Biplot CLARA con PCA
# PCA
pca <- prcomp(dfc[,-1], scale=TRUE)
df.pca <- pca$x
# Cluster over the three first PCA dimensions
CLA <- clara(df.pca[,1:3], k = 2, metric = "manhattan", stand = TRUE,
samples = 60, pamLike = TRUE)
fviz_pca_biplot(pca, label="var", habillage=as.factor(CLA$cluster)) +
labs(color=NULL) + ggtitle("") +
theme(text = element_text(size = 15),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
legend.key = element_rect(fill = "white"))
#Métodos de clasificación JerÔrquica
datj <- scale(dfc[,-1])
rownames(datj) <- dfc[,1]
# Matriz de distancias euclideas
mat_dist <- dist(x = datj, method = "euclidean")
# Dendrogramas con linkage complete y average
hc_euclidea_complete <- hclust(d = mat_dist, method = "complete")
hc_euclidea_average <- hclust(d = mat_dist, method = "average")
hc_euclidea_single <- hclust(d = mat_dist, method = "single")
hc_euclidea_ward.D2 <- hclust(d = mat_dist, method = "ward.D2")
hc_euclidea_median <- hclust(d = mat_dist, method = "median")
hc_euclidea_centroid <- hclust(d = mat_dist, method = "centroid")
hc_euclidea_mcquitty <- hclust(d = mat_dist, method = "mcquitty")
cor(x = mat_dist, cophenetic(hc_euclidea_complete))
## [1] 0.486
cor(x = mat_dist, cophenetic(hc_euclidea_average))
## [1] 0.618
cor(x = mat_dist, cophenetic(hc_euclidea_single))
## [1] 0.435
cor(x = mat_dist, cophenetic(hc_euclidea_ward.D2))
## [1] 0.502
cor(x = mat_dist, cophenetic(hc_euclidea_median))
## [1] 0.381
cor(x = mat_dist, cophenetic(hc_euclidea_centroid)) ## Tiene la mayor correlación
## [1] 0.538
cor(x = mat_dist, cophenetic(hc_euclidea_mcquitty))
## [1] 0.481
set.seed(101)
hc_euclidea_av <- hclust(d = dist(x = datj, method = "euclidean"),
method = "centroid")
fviz_dend(x = hc_euclidea_av, k = 2, cex = 0.5,
k_colors = c("red","green"),color_labels_by_k = T,
lwd = 0.2,type = "c",label_cols = rainbow(400),
rect_lty = "lightblue") +
geom_hline(yintercept = 3.65, linetype = "dashed") +
labs(title = "Herarchical clustering",
subtitle = "Distancia euclĆdea, Centroide, k=2")
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ā¹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 rows containing missing values (`geom_hline()`).