This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# Instalar librerias 
library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)

Attaching package: ‘cowplot’

The following object is masked from ‘package:lubridate’:

    stamp
library(ggpubr)

Attaching package: ‘ggpubr’

The following object is masked from ‘package:cowplot’:

    get_legend
library(cluster)
library(purrr)
library(dplyr)

La base de datos habla acerca de características que comparten los clientes de cierta empresa. Esta base de datos puede ser útil para identificar las necesidades insatisfechas de los clientes.

#Base de datos 
head(segmentation.data)

En este caso solo vamos a escoger las columnas que sean discretas y categóricas de más de dos categorías

Datos = select(segmentation.data, Age, Education, Income, Settlement.size)
Datos

Etiquetas = select(segmentation.data, ID)
# Estandarizar 

Datos_Estandarizados = scale(Datos, center = TRUE, scale = TRUE)
summary(Datos_Estandarizados)
      Age            Education            Income        Settlement.size  
 Min.   :-1.5281   Min.   :-1.73063   Min.   :-2.2337   Min.   :-0.9095  
 1st Qu.:-0.7602   1st Qu.:-0.06336   1st Qu.:-0.6112   1st Qu.:-0.9095  
 Median :-0.2482   Median :-0.06336   Median :-0.1419   Median : 0.3212  
 Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.: 0.5197   3rd Qu.:-0.06336   3rd Qu.: 0.4492   3rd Qu.: 0.3212  
 Max.   : 3.4209   Max.   : 3.27120   Max.   : 4.9440   Max.   : 1.5519  
# Escoger el número óptimo de Clústers 

clusters = function(n_clusters, data, iter.max=1000, nstart=50){
  
  cluster_medias = kmeans(data,centers = n_clusters,
                         iter.max = iter.max,
                         nstart = nstart)
  return(cluster_medias$tot.withinss)
}


# Se aplica esta funci?n con para diferentes valores de k
total_cluster <- map_dbl(.x = 1:15, # Se ejecuta 15 veces
                          .f = clusters,
                          data = Datos_Estandarizados)
total_cluster
 [1] 7996.000 5419.341 3874.219 3387.756 2984.304 2643.625 2366.884 2131.693 1918.229
[10] 1769.919 1620.143 1504.640 1395.722 1347.695 1252.196

#graficamos la varianza total

data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_cluster) %>%
  ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:15) +
  labs(title = "Suma total de cuadrados intra-cluster") +
  theme_bw()

Se puede ver que una buena cantidad de Clusters serían 3, ya que a partir del cuarto van empezando a tener una diferencia muy pequeña entre cada número de clúster

clientes = as.data.frame(Datos_Estandarizados)
nombres = rownames(clientes)
kmcluster = kmeans(clientes,centers=3,nstart = 50)
kmcluster
K-means clustering with 3 clusters of sizes 896, 767, 337

Cluster means:
         Age  Education     Income Settlement.size
1 -0.3947841 -0.3052608 -0.5968776      -0.8655478
2 -0.2373184 -0.3829002  0.3710312       0.9117068
3  1.5897619  1.6830805  0.7424969       0.2262662

Clustering vector:
   [1] 3 2 1 2 3 1 3 2 3 1 1 1 2 3 2 1 1 2 3 2 2 2 1 2 2 1 1 3 2 3 2 2 2 3 2 1 2 2 2 1 2
  [42] 1 1 2 3 2 1 1 1 1 1 2 2 1 3 1 2 2 2 1 2 2 3 2 2 2 1 1 2 2 2 1 2 3 2 3 2 2 1 2 3 2
  [83] 1 3 1 2 2 3 2 2 3 3 1 2 2 3 3 2 2 2 1 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 2 2 2 3 1
 [124] 3 2 2 1 2 2 1 2 2 1 2 2 2 2 3 2 3 1 2 2 2 2 2 1 3 2 2 1 2 2 2 2 1 2 1 2 1 2 2 2 3
 [165] 2 2 2 2 2 2 1 1 2 1 2 3 3 2 2 3 2 2 1 2 2 2 3 3 1 2 3 1 2 2 2 2 1 1 2 2 1 1 2 1 1
 [206] 2 1 2 1 2 2 3 2 3 1 1 2 2 1 3 1 1 2 2 2 2 2 3 1 2 2 1 1 1 1 1 2 3 2 2 1 2 2 1 1 2
 [247] 2 1 2 2 3 3 2 2 2 1 1 2 2 3 2 3 2 3 1 1 1 1 3 1 2 2 2 2 1 2 1 3 2 2 2 2 1 3 2 2 2
 [288] 2 3 3 1 2 3 3 3 2 2 1 2 2 3 3 1 1 3 2 1 1 2 1 1 1 2 1 1 2 1 3 2 1 1 1 1 1 2 1 2 2
 [329] 2 2 3 1 2 1 2 3 1 2 2 1 2 1 2 2 1 1 2 1 1 2 1 2 2 2 2 1 2 2 1 2 3 1 2 2 1 1 2 2 2
 [370] 3 2 2 1 3 3 2 1 2 3 3 2 2 1 2 1 1 1 2 2 2 2 1 2 2 2 2 3 3 3 3 2 2 2 1 1 1 2 2 1 2
 [411] 2 2 1 1 2 2 1 2 2 2 2 2 1 2 2 2 1 2 1 3 3 2 2 1 2 1 1 2 3 1 2 2 2 1 2 1 2 1 3 3 2
 [452] 1 2 2 2 2 2 1 2 2 2 2 1 2 2 3 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 1 3 1 2 2 2 3 1 2 2 3
 [493] 1 1 1 2 2 1 1 1 2 2 3 1 1 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 3 2 1 2 1 1 2 1 1 2 1
 [534] 2 2 2 3 2 2 3 2 3 1 2 3 2 2 2 1 2 1 1 1 2 3 1 2 2 1 1 2 1 1 2 2 1 1 2 1 2 1 1 1 1
 [575] 1 2 1 2 2 1 2 1 2 1 3 2 1 1 3 3 3 3 1 1 2 1 1 1 1 2 3 2 1 2 2 2 3 3 2 1 2 3 2 2 1
 [616] 3 2 1 1 3 2 2 2 2 3 2 2 2 2 3 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2 2 2 2 2 1 2 1 2 2 1 2
 [657] 1 2 2 2 2 1 2 2 3 2 2 2 1 1 1 2 2 2 2 2 2 1 3 2 3 2 2 3 1 2 2 2 2 2 2 1 2 1 2 1 1
 [698] 1 2 2 1 1 3 2 2 2 2 2 2 1 2 2 2 1 2 3 2 2 2 2 2 2 1 3 2 3 3 2 3 2 2 1 3 2 1 1 2 1
 [739] 2 1 2 1 2 2 2 1 2 1 1 2 3 1 1 2 3 2 3 2 1 1 3 2 1 2 2 1 1 1 2 3 1 2 2 2 3 1 2 2 2
 [780] 3 2 3 1 1 3 1 1 2 2 2 3 2 1 3 2 1 3 1 3 1 2 3 1 1 1 2 1 3 3 1 2 1 2 2 2 3 3 2 3 2
 [821] 1 2 1 1 2 1 1 2 2 2 2 2 2 3 1 2 1 1 3 3 1 1 2 1 2 1 3 1 3 3 2 1 1 1 2 2 3 1 2 2 2
 [862] 1 2 1 1 2 2 2 2 1 3 2 2 2 2 3 2 2 2 1 2 3 2 3 3 3 2 2 2 1 2 2 1 2 2 2 2 1 1 2 2 3
 [903] 2 2 2 2 1 1 3 1 2 1 2 2 1 2 1 3 2 2 2 2 2 2 3 2 2 2 3 1 1 1 2 2 1 2 1 1 2 2 2 1 2
 [944] 1 3 2 2 2 1 2 1 2 1 2 2 3 1 1 1 2 2 3 2 1 1 2 1 3 2 1 2 1 1 2 1 3 3 2 1 2 2 2 2 2
 [985] 1 2 2 3 2 1 1 1 1 2 2 1 2 2 1 2
 [ reached getOption("max.print") -- omitted 1000 entries ]

Within cluster sum of squares by cluster:
[1] 1144.149 1501.911 1228.159
 (between_SS / total_SS =  51.5 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
clientes = clientes %>% mutate(cluster = kmcluster$cluster)

(g1=ggplot(clientes, aes(x = Age, y = Income)) +
    geom_point(aes(color=as.factor(cluster)), size=5)+
    geom_text(aes(label = cluster), size = 1) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=3") 
)

Al graficar a los Clusters considerando solo las variables de edad e ingreso, se puede ver que en sí no hay una división notoria a simple vista de cada uno de los clusters, sin embargo, este algortimo nos ayudó a saber que hay tres tipos de clientes principales.

fviz_cluster(kmcluster, clientes)+
  theme_minimal()

A pesar de que no es muy fácil distinguir cuáles son los clientes que pertenecen a cada categoría, se puede ver que se dividen en tres categorías

Con esto, se pueden sacar las características escenciales de cada clúster, comprender a los clientes y saber quiénes son los clientes objetivo de determinada promoción o producto para la empresa.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4KClRyeSBleGVjdXRpbmcgdGhpcyBjaHVuayBieSBjbGlja2luZyB0aGUgKlJ1biogYnV0dG9uIHdpdGhpbiB0aGUgY2h1bmsgb3IgYnkgcGxhY2luZyB5b3VyIGN1cnNvciBpbnNpZGUgaXQgYW5kIHByZXNzaW5nICpDbWQrU2hpZnQrRW50ZXIqLgoKYGBge3J9CiMgSW5zdGFsYXIgbGlicmVyaWFzIApsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShmYWN0b2V4dHJhKQpsaWJyYXJ5KGNvd3Bsb3QpCmxpYnJhcnkoZ2dwdWJyKQpsaWJyYXJ5KGNsdXN0ZXIpCmxpYnJhcnkocHVycnIpCmxpYnJhcnkoZHBseXIpCmBgYAoKTGEgYmFzZSBkZSBkYXRvcyBoYWJsYSBhY2VyY2EgZGUgY2FyYWN0ZXLDrXN0aWNhcyBxdWUgY29tcGFydGVuIGxvcyBjbGllbnRlcyBkZSBjaWVydGEgZW1wcmVzYS4gRXN0YSBiYXNlIGRlIGRhdG9zIHB1ZWRlIHNlciDDunRpbCBwYXJhIGlkZW50aWZpY2FyIGxhcyBuZWNlc2lkYWRlcyBpbnNhdGlzZmVjaGFzIGRlIGxvcyBjbGllbnRlcy4KCmBgYHtyfQojQmFzZSBkZSBkYXRvcyAKaGVhZChzZWdtZW50YXRpb24uZGF0YSkKYGBgCgpFbiBlc3RlIGNhc28gc29sbyB2YW1vcyBhIGVzY29nZXIgbGFzIGNvbHVtbmFzIHF1ZSBzZWFuIGRpc2NyZXRhcyB5IGNhdGVnw7NyaWNhcyBkZSBtw6FzIGRlIGRvcyBjYXRlZ29yw61hcwoKYGBge3J9CkRhdG9zID0gc2VsZWN0KHNlZ21lbnRhdGlvbi5kYXRhLCBBZ2UsIEVkdWNhdGlvbiwgSW5jb21lLCBTZXR0bGVtZW50LnNpemUpCkRhdG9zCgpFdGlxdWV0YXMgPSBzZWxlY3Qoc2VnbWVudGF0aW9uLmRhdGEsIElEKQpgYGAKCmBgYHtyfQojIEVzdGFuZGFyaXphciAKCkRhdG9zX0VzdGFuZGFyaXphZG9zID0gc2NhbGUoRGF0b3MsIGNlbnRlciA9IFRSVUUsIHNjYWxlID0gVFJVRSkKc3VtbWFyeShEYXRvc19Fc3RhbmRhcml6YWRvcykKCmBgYAoKYGBge3J9CiMgRXNjb2dlciBlbCBuw7ptZXJvIMOzcHRpbW8gZGUgQ2zDunN0ZXJzIAoKY2x1c3RlcnMgPSBmdW5jdGlvbihuX2NsdXN0ZXJzLCBkYXRhLCBpdGVyLm1heD0xMDAwLCBuc3RhcnQ9NTApewogIAogIGNsdXN0ZXJfbWVkaWFzID0ga21lYW5zKGRhdGEsY2VudGVycyA9IG5fY2x1c3RlcnMsCiAgICAgICAgICAgICAgICAgICAgICAgICBpdGVyLm1heCA9IGl0ZXIubWF4LAogICAgICAgICAgICAgICAgICAgICAgICAgbnN0YXJ0ID0gbnN0YXJ0KQogIHJldHVybihjbHVzdGVyX21lZGlhcyR0b3Qud2l0aGluc3MpCn0KCgojIFNlIGFwbGljYSBlc3RhIGZ1bmNpP24gY29uIHBhcmEgZGlmZXJlbnRlcyB2YWxvcmVzIGRlIGsKdG90YWxfY2x1c3RlciA8LSBtYXBfZGJsKC54ID0gMToxNSwgIyBTZSBlamVjdXRhIDE1IHZlY2VzCiAgICAgICAgICAgICAgICAgICAgICAgICAgLmYgPSBjbHVzdGVycywKICAgICAgICAgICAgICAgICAgICAgICAgICBkYXRhID0gRGF0b3NfRXN0YW5kYXJpemFkb3MpCnRvdGFsX2NsdXN0ZXIKCmBgYAoKYGBge3J9CgojZ3JhZmljYW1vcyBsYSB2YXJpYW56YSB0b3RhbAoKZGF0YS5mcmFtZShuX2NsdXN0ZXJzID0gMToxNSwgc3VtYV9jdWFkcmFkb3NfaW50ZXJub3MgPSB0b3RhbF9jbHVzdGVyKSAlPiUKICBnZ3Bsb3QoYWVzKHggPSBuX2NsdXN0ZXJzLCB5ID0gc3VtYV9jdWFkcmFkb3NfaW50ZXJub3MpKSArCiAgZ2VvbV9saW5lKCkgKwogIGdlb21fcG9pbnQoKSArCiAgc2NhbGVfeF9jb250aW51b3VzKGJyZWFrcyA9IDE6MTUpICsKICBsYWJzKHRpdGxlID0gIlN1bWEgdG90YWwgZGUgY3VhZHJhZG9zIGludHJhLWNsdXN0ZXIiKSArCiAgdGhlbWVfYncoKQoKYGBgCgpTZSBwdWVkZSB2ZXIgcXVlIHVuYSBidWVuYSBjYW50aWRhZCBkZSBDbHVzdGVycyBzZXLDrWFuIDMsIHlhIHF1ZSBhIHBhcnRpciBkZWwgY3VhcnRvIHZhbiBlbXBlemFuZG8gYSB0ZW5lciB1bmEgZGlmZXJlbmNpYSBtdXkgcGVxdWXDsWEgZW50cmUgY2FkYSBuw7ptZXJvIGRlIGNsw7pzdGVyCgpgYGB7cn0KY2xpZW50ZXMgPSBhcy5kYXRhLmZyYW1lKERhdG9zX0VzdGFuZGFyaXphZG9zKQpub21icmVzID0gcm93bmFtZXMoY2xpZW50ZXMpCmBgYAoKYGBge3J9CmttY2x1c3RlciA9IGttZWFucyhjbGllbnRlcyxjZW50ZXJzPTMsbnN0YXJ0ID0gNTApCmttY2x1c3RlcgpgYGAKCmBgYHtyfQpjbGllbnRlcyA9IGNsaWVudGVzICU+JSBtdXRhdGUoY2x1c3RlciA9IGttY2x1c3RlciRjbHVzdGVyKQoKKGcxPWdncGxvdChjbGllbnRlcywgYWVzKHggPSBBZ2UsIHkgPSBJbmNvbWUpKSArCiAgICBnZW9tX3BvaW50KGFlcyhjb2xvcj1hcy5mYWN0b3IoY2x1c3RlcikpLCBzaXplPTUpKwogICAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9IGNsdXN0ZXIpLCBzaXplID0gMSkgKwogICAgdGhlbWVfYncoKSArCiAgICB0aGVtZShsZWdlbmQucG9zaXRpb24gPSAibm9uZSIpKwogICAgbGFicyh0aXRsZSA9ICJLbWVuYXMgY29uIGs9MyIpIAopCmBgYAoKQWwgZ3JhZmljYXIgYSBsb3MgQ2x1c3RlcnMgY29uc2lkZXJhbmRvIHNvbG8gbGFzIHZhcmlhYmxlcyBkZSBlZGFkIGUgaW5ncmVzbywgc2UgcHVlZGUgdmVyIHF1ZSBlbiBzw60gbm8gaGF5IHVuYSBkaXZpc2nDs24gbm90b3JpYSBhIHNpbXBsZSB2aXN0YSBkZSBjYWRhIHVubyBkZSBsb3MgY2x1c3RlcnMsIHNpbiBlbWJhcmdvLCBlc3RlIGFsZ29ydGltbyBub3MgYXl1ZMOzIGEgc2FiZXIgcXVlIGhheSB0cmVzIHRpcG9zIGRlIGNsaWVudGVzIHByaW5jaXBhbGVzLgoKYGBge3J9CmZ2aXpfY2x1c3RlcihrbWNsdXN0ZXIsIGNsaWVudGVzKSsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgpBIHBlc2FyIGRlIHF1ZSBubyBlcyBtdXkgZsOhY2lsIGRpc3Rpbmd1aXIgY3XDoWxlcyBzb24gbG9zIGNsaWVudGVzIHF1ZSBwZXJ0ZW5lY2VuIGEgY2FkYSBjYXRlZ29yw61hLCBzZSBwdWVkZSB2ZXIgcXVlIHNlIGRpdmlkZW4gZW4gdHJlcyBjYXRlZ29yw61hcwoKQ29uIGVzdG8sIHNlIHB1ZWRlbiBzYWNhciBsYXMgY2FyYWN0ZXLDrXN0aWNhcyBlc2NlbmNpYWxlcyBkZSBjYWRhIGNsw7pzdGVyLCBjb21wcmVuZGVyIGEgbG9zIGNsaWVudGVzIHkgc2FiZXIgcXVpw6luZXMgc29uIGxvcyBjbGllbnRlcyBvYmpldGl2byBkZSBkZXRlcm1pbmFkYSBwcm9tb2Npw7NuIG8gcHJvZHVjdG8gcGFyYSBsYSBlbXByZXNhLgo=