clusters

library(tidyverse)
library(factoextra)
library(cowplot)
library(ggpubr)
library(cluster)
library(purrr)
library(dplyr)
library(readxl)

Escalar y centrar las variables: media=0 y sd= 1

El set de datos contiene diferentes datos sobre el ahorro personal y renta de distintos países.

Variables:

sr: La tasa de ahorro personal, que representa el porcentaje de ingreso personal disponible que se ahorra.
pop15: La proporción de la población menor de 15 años, que indica la cantidad de personas menores de 15 años como porcentaje de la población total.
pop75: La proporción de la población mayor de 75 años, que representa la cantidad de personas mayores de 75 años como porcentaje de la población total.
dpi: La renta per cápita disponible, que es el ingreso disponible promedio por persona en el país.
ddpi: El crecimiento anual de la renta per cápita disponible, que mide el cambio en la renta per cápita disponible con el tiempo.

View(LifeCycleSavings)

?LifeCycleSavings
summary(LifeCycleSavings)

##        sr             pop15           pop75            dpi         
##  Min.   : 0.600   Min.   :21.44   Min.   :0.560   Min.   :  88.94  
##  1st Qu.: 6.970   1st Qu.:26.21   1st Qu.:1.125   1st Qu.: 288.21  
##  Median :10.510   Median :32.58   Median :2.175   Median : 695.66  
##  Mean   : 9.671   Mean   :35.09   Mean   :2.293   Mean   :1106.76  
##  3rd Qu.:12.617   3rd Qu.:44.06   3rd Qu.:3.325   3rd Qu.:1795.62  
##  Max.   :21.100   Max.   :47.64   Max.   :4.700   Max.   :4001.89  
##       ddpi       
##  Min.   : 0.220  
##  1st Qu.: 2.002  
##  Median : 3.000  
##  Mean   : 3.758  
##  3rd Qu.: 4.478  
##  Max.   :16.710

jubilacion = scale(LifeCycleSavings, center = TRUE, scale = TRUE)
summary(jubilacion)

##        sr              pop15             pop75               dpi         
##  Min.   :-2.0246   Min.   :-1.4915   Min.   :-1.34261   Min.   :-1.0272  
##  1st Qu.:-0.6028   1st Qu.:-0.9697   1st Qu.:-0.90489   1st Qu.:-0.8261  
##  Median : 0.1873   Median :-0.2748   Median :-0.09142   Median :-0.4149  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.6576   3rd Qu.: 0.9807   3rd Qu.: 0.79952   3rd Qu.: 0.6952  
##  Max.   : 2.5509   Max.   : 1.3714   Max.   : 1.86478   Max.   : 2.9218  
##       ddpi        
##  Min.   :-1.2327  
##  1st Qu.:-0.6116  
##  Median :-0.2640  
##  Mean   : 0.0000  
##  3rd Qu.: 0.2508  
##  Max.   : 4.5132

jubilacion = as.data.frame(jubilacion)
países=rownames(jubilacion)

creamos 5 cluster en base a que tanto se ahorra para jubilarse bien

?kmeans
kmcluster = kmeans(jubilacion,centers=6,nstart = 50)
kmcluster

## K-means clustering with 6 clusters of sizes 15, 2, 4, 13, 10, 6
## 
## Cluster means:
##           sr      pop15      pop75        dpi       ddpi
## 1  0.4903572 -1.0075621  1.0750677  0.8142890 -0.2184535
## 2 -0.3048830  0.7993464 -0.3044691 -0.8626100  3.3842641
## 3 -0.7925619 -0.6380872  0.9157315  1.9572030 -0.5244487
## 4 -1.1967148  1.0090168 -0.9403441 -0.7953190 -0.3730750
## 5  0.5702607  0.7753072 -0.9041105 -0.7148831 -0.3357643
## 6  1.0465567 -0.8005338  0.3475958 -0.1383248  1.1356145
## 
## Clustering vector:
##      Australia        Austria        Belgium        Bolivia         Brazil 
##              1              1              1              4              5 
##         Canada          Chile          China       Colombia     Costa Rica 
##              3              4              5              4              5 
##        Denmark        Ecuador        Finland         France        Germany 
##              1              4              1              1              1 
##         Greece      Guatamala       Honduras        Iceland          India 
##              6              4              4              3              5 
##        Ireland          Italy          Japan          Korea     Luxembourg 
##              1              1              6              4              1 
##          Malta         Norway    Netherlands    New Zealand      Nicaragua 
##              6              1              6              1              4 
##         Panama       Paraguay           Peru    Philippines       Portugal 
##              4              4              5              5              6 
##   South Africa South Rhodesia          Spain         Sweden    Switzerland 
##              5              5              6              3              1 
##         Turkey        Tunisia United Kingdom  United States      Venezuela 
##              4              4              1              3              5 
##         Zambia        Jamaica        Uruguay          Libya       Malaysia 
##              5              2              1              2              4 
## 
## Within cluster sum of squares by cluster:
## [1] 15.358194  2.690976  6.316750  7.750793 12.990346  7.047887
##  (between_SS / total_SS =  78.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Hay que gráficar sr y dpi

jubilacion = jubilacion %>% mutate(cluster = kmcluster$cluster)

(g1=ggplot(jubilacion, aes(x = sr, y = dpi)) +
    geom_point(aes(color=as.factor(cluster)), size=10)+
    geom_text(aes(label = cluster), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=5"))

graficamos sus 2 primeras componentes

fviz_cluster(kmcluster, jubilacion)+
  theme_minimal()

Adicionamos la etiqueta de los países

rownames(jubilacion)=países

fviz_cluster(kmcluster, jubilacion, show.clust.cent = T,
             ellipse.type = "euclid", star.plot = T, repel = T) +
  labs(title = "Resultados clustering K-means") +
  theme_bw()

## Too few points to calculate an ellipse

Creamos 4 cluster k=4

kmcluster2 = kmeans(jubilacion, centers=4, nstart = 50)
jubilacion = jubilacion %>% mutate(cluster2 = kmcluster2$cluster)

(g2=ggplot(jubilacion, aes(x = sr, y = dpi)) +
    geom_point(aes(color=as.factor(cluster2)), size=10)+
    geom_text(aes(label = cluster2), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=4") )

Ambas gráficas al mismo tiempo

plot_grid(g1,g2)

##Buscar el numero óptimo de Clusters

# creamos una función que nos retorne la var.within para cada k
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
  cluster_means = kmeans(data,centers = n_clusters,
                         iter.max = iter.max,
                         nstart = nstart)
  return(cluster_means$tot.withinss)}


# Se aplica esta función con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
                          .f = total_within,
                          data = jubilacion)
total_withinss

##  [1] 492.56000 218.82248 137.24094  97.86257  75.10152  60.26602  48.19455
##  [8]  43.03800  38.99591  34.44158  30.39949  28.39731  25.87713  23.08936
## [15]  21.82076

#graficamos la varianza total

data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
  ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:15) +
  labs(title = "Suma total de cuadrados intra-cluster") +
  theme_bw()

De acuerdo con la gráfica, el mejor número de clusters es 4

clusters

abraham castañon alfaro - A01747966

2023-10-03