Clustering

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cowplot)

## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp

library(ggpubr)

## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:cowplot':
## 
##     get_legend

library(cluster)
library(purrr)
library(dplyr)

Escalar y centrar las variables: media=0 y sd= 1

El set de datos USArrests contiene informacion sobre el numero de delitos (asaltos, asesinatos y secuestros)

#View(USArrests)
?USArrests
summary(USArrests)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

inseguridad = scale(USArrests, center = TRUE, scale = TRUE)
summary(inseguridad)

##      Murder           Assault           UrbanPop             Rape        
##  Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
##  1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
##  Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
##  Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444

inseguridad = as.data.frame(inseguridad)
ciudades=rownames(inseguridad)

creamos 4 cluster en funcion a su grado de inseguridad

?kmeans
kmcluster = kmeans(inseguridad,centers=4,nstart = 50)
kmcluster

## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1  1.4118898  0.8743346 -0.8145211  0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 4  0.6950701  1.0394414  0.7226370  1.27693964
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              4              4              1              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              4              3              3              4              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              2              4              3              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              2              1              2              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              4              2              1              4 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              4              2              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              4              1              2              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              4              3              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              2              2              3 
## 
## Within cluster sum of squares by cluster:
## [1]  8.316061 11.952463 16.212213 19.922437
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Graficamos los cluster en funcion del %muertes y %asaltos

inseguridad = inseguridad %>% mutate(cluster = kmcluster$cluster)

(g1=ggplot(inseguridad, aes(x = Murder, y = Assault)) +
    geom_point(aes(color=as.factor(cluster)), size=10)+
    geom_text(aes(label = cluster), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=4") 
)

graficamos sus 2 primeras componentes

fviz_cluster(kmcluster, inseguridad)+
  theme_minimal()

Adicionamos la etiqueta de las ciudades

rownames(inseguridad)=ciudades

fviz_cluster(kmcluster, inseguridad, show.clust.cent = T,
             ellipse.type = "euclid", star.plot = T, repel = T) +
  labs(title = "Resultados clustering K-means") +
  theme_bw()

Creamos 2 cluster k=2

kmcluster2 = kmeans(inseguridad, centers=2, nstart = 50)
inseguridad = inseguridad %>% mutate(cluster2 = kmcluster2$cluster)

(g2=ggplot(inseguridad, aes(x = Murder, y = Assault)) +
    geom_point(aes(color=as.factor(cluster2)), size=10)+
    geom_text(aes(label = cluster2), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=2") 
)

Podemos graficar ambas al mismo tiempo (notese el cambio en las etiquetas)

plot_grid(g1,g2)

##Buscar el numero optimo de Clusters

# creamos una funcion que nos retorne la var.within para cada k
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
  
  cluster_means = kmeans(data,centers = n_clusters,
                         iter.max = iter.max,
                         nstart = nstart)
  return(cluster_means$tot.withinss)
}


# Se aplica esta funci?n con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
                          .f = total_within,
                          data = inseguridad)
total_withinss

##  [1] 258.50000 152.98322  88.76570  56.40317  50.30498  44.47436  39.80028
##  [8]  35.22490  31.55005  27.86565  24.93027  22.67622  20.93040  19.31251
## [15]  17.89404

#graficamos la varianza total

data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
  ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:15) +
  labs(title = "Suma total de cuadrados intra-cluster") +
  theme_bw()

En este punto nos damos cuenta que 4 clusters es el punto donde la varianza deja de incrementarse mucho y por eso escogemos ese punto como el número de clusters ideal

#otro metodo, usando el paquete “factoextra”

matriz_dist=get_dist(inseguridad, method = "euclidean")

fviz_nbclust(inseguridad, FUNcluster = kmeans, 
             method = "wss", k.max = 15, 
             diss = matriz_dist, nstart = 50)

library(cluster)

#inseguridad2 <- hclust(kmcluster, method = 'average')
inseguridad2 <- dist(USArrests, method = 'euclidean') #Sacamos la distancia euclidiana de los puntos
inseguridad3 <- hclust(inseguridad2, method = 'average') #Hacemos el análisis de clusters

plot(inseguridad3, cex=0.5, col="red", hang = -1, 
     main="Dendograma, Distancia Euclídea, Método completo")
rect.hclust(inseguridad3, k = 4, border = 2:10)  ##Arma grupos en la grafica