library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(cluster)
library(purrr)
library(dplyr)
Escalar y centrar las variables: media=0 y sd= 1
El set de datos USArrests contiene informacion sobre el numero de delitos (asaltos, asesinatos y secuestros)
#View(USArrests)
?USArrests
summary(USArrests)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
N
inseguridad = scale(USArrests, center = TRUE, scale = TRUE)
summary(inseguridad)
## Murder Assault UrbanPop Rape
## Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
## 1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
## Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
## Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
inseguridad = as.data.frame(inseguridad)
ciudades=rownames(inseguridad)
creamos 4 cluster en funcion a su grado de inseguridad
?kmeans
kmcluster = kmeans(inseguridad,centers=4,nstart = 50)
kmcluster
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Graficamos los cluster en funcion del %muertes y %asaltos
inseguridad = inseguridad %>% mutate(cluster = kmcluster$cluster)
(g1=ggplot(inseguridad, aes(x = Murder, y = Assault)) +
geom_point(aes(color=as.factor(cluster)), size=10)+
geom_text(aes(label = cluster), size = 5) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "Kmenas con k=4")
)
graficamos sus 2 primeras componentes
fviz_cluster(kmcluster, inseguridad)+
theme_minimal()
Adicionamos la etiqueta de las ciudades
rownames(inseguridad)=ciudades
fviz_cluster(kmcluster, inseguridad, show.clust.cent = T,
ellipse.type = "euclid", star.plot = T, repel = T) +
labs(title = "Resultados clustering K-means") +
theme_bw()
Creamos 2 cluster k=2
kmcluster2 = kmeans(inseguridad, centers=2, nstart = 50)
inseguridad = inseguridad %>% mutate(cluster2 = kmcluster2$cluster)
(g2=ggplot(inseguridad, aes(x = Murder, y = Assault)) +
geom_point(aes(color=as.factor(cluster2)), size=10)+
geom_text(aes(label = cluster2), size = 5) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "Kmenas con k=2")
)
Podemos graficar ambas al mismo tiempo (notese el cambio en las
etiquetas)
plot_grid(g1,g2)
##Buscar el numero optimo de Clusters
# creamos una funcion que nos retorne la var.within para cada k
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
cluster_means = kmeans(data,centers = n_clusters,
iter.max = iter.max,
nstart = nstart)
return(cluster_means$tot.withinss)
}
# Se aplica esta funci?n con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
.f = total_within,
data = inseguridad)
total_withinss
## [1] 258.50000 152.98322 88.76570 56.40317 50.30498 44.47436 39.80028
## [8] 35.22490 31.55005 27.86565 24.93027 22.67622 20.93040 19.31251
## [15] 17.89404
#graficamos la varianza total
data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:15) +
labs(title = "Suma total de cuadrados intra-cluster") +
theme_bw()
En este punto nos damos cuenta que 4 clusters es el punto donde la varianza deja de incrementarse mucho y por eso escogemos ese punto como el número de clusters ideal
#otro metodo, usando el paquete “factoextra”
matriz_dist=get_dist(inseguridad, method = "euclidean")
fviz_nbclust(inseguridad, FUNcluster = kmeans,
method = "wss", k.max = 15,
diss = matriz_dist, nstart = 50)
library(cluster)
#inseguridad2 <- hclust(kmcluster, method = 'average')
inseguridad2 <- dist(USArrests, method = 'euclidean') #Sacamos la distancia euclidiana de los puntos
inseguridad3 <- hclust(inseguridad2, method = 'average') #Hacemos el análisis de clusters
plot(inseguridad3, cex=0.5, col="red", hang = -1,
main="Dendograma, Distancia Euclídea, Método completo")
rect.hclust(inseguridad3, k = 4, border = 2:10) ##Arma grupos en la grafica