Instalar paquetes y llamar librerias

library(cluster)
library(ggplot2)
library(factoextra)
library(data.table)
library(dplyr)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(devtools)
library(fastDummies)

Ejemplo vinos

Contexto

Estos datos son el resultado de un análisis químico de vinos cultivados en la misma región de Italia pero derivados de tres cultivares diferentes.

El análisis determinó las cantidades de 13 componentes que se encuentran en cada uno de los tres cultivares.

Importar la base de datos

data <- read.csv('C:\\Users\\ACER\\Downloads\\wine_dataset.csv')

Entender la base de datos

summary(data)
##     alcohol        malic_acid         ash        alcalinity_of_ash
##  Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60    
##  1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20    
##  Median :13.05   Median :1.865   Median :2.360   Median :19.50    
##  Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49    
##  3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50    
##  Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00    
##    magnesium      total_phenols     flavanoids    nonflavanoid_phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  proanthocyanins color_intensity       hue         od280.od315_of_diluted_wines
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270               
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938               
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780               
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612               
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170               
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000               
##     proline           target      
##  Min.   : 278.0   Min.   :0.0000  
##  1st Qu.: 500.5   1st Qu.:0.0000  
##  Median : 673.5   Median :1.0000  
##  Mean   : 746.9   Mean   :0.9382  
##  3rd Qu.: 985.0   3rd Qu.:2.0000  
##  Max.   :1680.0   Max.   :2.0000

Escalar la base de datos

scaled_data <- subset(data, select = -target)
scaled_data <- scale(scaled_data)

Generar los clusters

clusters <- 3
kmean <- kmeans(scaled_data, clusters)

Asignar los clusters a los datos

labeled_data <- cbind(data, cluster = kmean$cluster)

Gráficar los clusters

fviz_cluster(kmean, data = data)

Optimizar los clusters

# The optimal quantity of clusters corresponds to the first highest point on the chart
set.seed(123)
optimal <- clusGap(scaled_data, FUN = kmeans, nstart = 1, K.max = 10)
plot(optimal, xlab = 'Number of clusters')

Comparar clusters

cluster_mean <- aggregate(labeled_data, by = list(labeled_data$cluster), FUN = mean)
cluster_mean
##   Group.1  alcohol malic_acid      ash alcalinity_of_ash magnesium
## 1       1 12.25092   1.897385 2.231231          20.06308  92.73846
## 2       2 13.67677   1.997903 2.466290          17.46290 107.96774
## 3       3 13.13412   3.307255 2.417647          21.24118  98.66667
##   total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity
## 1      2.247692  2.0500000            0.3576923        1.624154        2.973077
## 2      2.847581  3.0032258            0.2920968        1.922097        5.453548
## 3      1.683922  0.8188235            0.4519608        1.145882        7.234706
##         hue od280.od315_of_diluted_wines   proline    target cluster
## 1 1.0627077                     2.803385  510.1692 1.0000000       1
## 2 1.0654839                     3.163387 1100.2258 0.0483871       2
## 3 0.6919608                     1.696667  619.0588 1.9411765       3
table(labeled_data$cluster)
## 
##  1  2  3 
## 65 62 51

Ejercicio México 2024

Importar la base de datos

dataset <- read.csv('C:\\Users\\ACER\\Downloads\\mexico2024.csv')

Escalar la base de datos

scaled_data <- scale(dataset[, -1])

Determinar el número óptimo de clusters

set.seed(123)
gap_stat <- clusGap(scaled_data, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
plot(gap_stat, xlab = "Número de clusters", main = "Gap Statistic for Optimal Clusters")

Generar los clusters

n_clusters <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "firstSEmax")
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = n_clusters, nstart = 25)

Asignar los clusters a los datos

dataset$cluster <- as.factor(kmeans_result$cluster)

Comparar clusters

cluster_mean_mex <- aggregate(dataset[, -1], by = list(dataset$cluster), FUN = mean)
print(cluster_mean_mex)
##   Group.1 Población PIB.per.cápita Esperanza.de.vida Tasa.de.pobreza
## 1       1  7.500000      271311.50          76.00000        22.20000
## 2       2  0.900000      481697.00          75.10000        40.90000
## 3       3  6.425000       88347.25          74.02500        51.72500
## 4       4  4.366667       53849.33          73.76667        62.06667
## 5       5 17.400000       85184.00          74.50000        42.70000
## 6       6  2.100000      149812.25          76.05000        16.22500
## 7       7  2.100000      101257.00          74.72500        43.27500
## 8       8  3.222222      137203.56          75.44444        29.91111
##   Tasa.de.alfabetización cluster
## 1               98.75000      NA
## 2               96.50000      NA
## 3               94.17500      NA
## 4               88.06667      NA
## 5               97.10000      NA
## 6               98.40000      NA
## 7               94.81250      NA
## 8               97.25556      NA

Obtener mapa de México

mexico_map <- ne_states(country = "Mexico", returnclass = "sf")
mexico_clusters <- left_join(mexico_map, dataset, by = "name")
palette_colors <- c("#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a")
generate_cluster_map <- function(variable, title) {
  ggplot(mexico_clusters) +
    geom_sf(aes_string(fill = variable), color = "black") +
    scale_fill_viridis_c() +  
    labs(title = title) +
    theme_minimal()
}

Mapa de México - Población

map1 <- generate_cluster_map("Población", "Clusters por Población en México")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
map1

Mapa de México - PIB per cápita

map2 <- generate_cluster_map("PIB.per.cápita", "Clusters por PIB per cápita en México")
map2

Mapa de México - Esperanza de vida

map3 <- generate_cluster_map("Esperanza.de.vida", "Clusters por Esperanza de Vida en México")
map3

Mapa de México - Tasa de pobreza

map4 <- generate_cluster_map("Tasa.de.pobreza", "Clusters por Tasa de Pobreza en México")
map4

Mapa de México - Tasa de alfabetización

map5 <- generate_cluster_map("Tasa.de.alfabetización", "Clusters por Tasa de Alfabetización en México")
map5

LS0tDQp0aXRsZTogIkNsdXN0ZXJzIg0KYXV0aG9yOiANCmRhdGU6ICIyMDI1LTAyLTE5Ig0Kb3V0cHV0OiANCiAgaHRtbF9kb2N1bWVudDoNCiAgICAgIHRvYzogVFJVRQ0KICAgICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgICBjb2RlX2Rvd25sb2FkOiBUUlVFDQogICAgICB0aGVtZTogY29zbW8NCi0tLQ0KDQohW10oQzpcXFVzZXJzXFxBQ0VSXFxEb3dubG9hZHNcXHdpbmUuanBnKSAgDQoNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IHJlZDsiPkluc3RhbGFyIHBhcXVldGVzIHkgbGxhbWFyIGxpYnJlcmlhczwvc3Bhbj4NCg0KYGBge3IgbWVzc2FnZT1GQUxTRX0NCmxpYnJhcnkoY2x1c3RlcikNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoZmFjdG9leHRyYSkNCmxpYnJhcnkoZGF0YS50YWJsZSkNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHNmKQ0KbGlicmFyeShybmF0dXJhbGVhcnRoKQ0KbGlicmFyeShybmF0dXJhbGVhcnRoZGF0YSkNCmxpYnJhcnkoZGV2dG9vbHMpDQpsaWJyYXJ5KGZhc3REdW1taWVzKQ0KYGBgDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5FamVtcGxvIHZpbm9zPC9zcGFuPg0KDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+Q29udGV4dG88L3NwYW4+DQoNCkVzdG9zIGRhdG9zIHNvbiBlbCByZXN1bHRhZG8gZGUgdW4gYW7DoWxpc2lzIHF1w61taWNvIGRlIHZpbm9zIGN1bHRpdmFkb3MgZW4gbGEgbWlzbWEgcmVnacOzbiBkZSBJdGFsaWEgcGVybyBkZXJpdmFkb3MgZGUgdHJlcyBjdWx0aXZhcmVzIGRpZmVyZW50ZXMuICANCg0KRWwgYW7DoWxpc2lzIGRldGVybWluw7MgbGFzIGNhbnRpZGFkZXMgZGUgMTMgY29tcG9uZW50ZXMgcXVlIHNlIGVuY3VlbnRyYW4gZW4gY2FkYSB1bm8gZGUgbG9zIHRyZXMgY3VsdGl2YXJlcy4NCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5JbXBvcnRhciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPg0KDQpgYGB7cn0NCmRhdGEgPC0gcmVhZC5jc3YoJ0M6XFxVc2Vyc1xcQUNFUlxcRG93bmxvYWRzXFx3aW5lX2RhdGFzZXQuY3N2JykNCmBgYA0KDQojIyA8c3BhbiBzdHlsZT0iY29sb3I6IHJlZDsiPkVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+DQoNCmBgYHtyfQ0Kc3VtbWFyeShkYXRhKQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+RXNjYWxhciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPg0KDQpgYGB7cn0NCnNjYWxlZF9kYXRhIDwtIHN1YnNldChkYXRhLCBzZWxlY3QgPSAtdGFyZ2V0KQ0Kc2NhbGVkX2RhdGEgPC0gc2NhbGUoc2NhbGVkX2RhdGEpDQpgYGANCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5HZW5lcmFyIGxvcyBjbHVzdGVyczwvc3Bhbj4NCg0KYGBge3J9DQpjbHVzdGVycyA8LSAzDQprbWVhbiA8LSBrbWVhbnMoc2NhbGVkX2RhdGEsIGNsdXN0ZXJzKQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+QXNpZ25hciBsb3MgY2x1c3RlcnMgYSBsb3MgZGF0b3M8L3NwYW4+DQoNCmBgYHtyfQ0KbGFiZWxlZF9kYXRhIDwtIGNiaW5kKGRhdGEsIGNsdXN0ZXIgPSBrbWVhbiRjbHVzdGVyKQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+R3LDoWZpY2FyIGxvcyBjbHVzdGVyczwvc3Bhbj4NCg0KYGBge3J9DQpmdml6X2NsdXN0ZXIoa21lYW4sIGRhdGEgPSBkYXRhKQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+T3B0aW1pemFyIGxvcyBjbHVzdGVyczwvc3Bhbj4NCg0KYGBge3J9DQojIFRoZSBvcHRpbWFsIHF1YW50aXR5IG9mIGNsdXN0ZXJzIGNvcnJlc3BvbmRzIHRvIHRoZSBmaXJzdCBoaWdoZXN0IHBvaW50IG9uIHRoZSBjaGFydA0Kc2V0LnNlZWQoMTIzKQ0Kb3B0aW1hbCA8LSBjbHVzR2FwKHNjYWxlZF9kYXRhLCBGVU4gPSBrbWVhbnMsIG5zdGFydCA9IDEsIEsubWF4ID0gMTApDQpwbG90KG9wdGltYWwsIHhsYWIgPSAnTnVtYmVyIG9mIGNsdXN0ZXJzJykNCmBgYA0KDQojIyA8c3BhbiBzdHlsZT0iY29sb3I6IHJlZDsiPkNvbXBhcmFyIGNsdXN0ZXJzPC9zcGFuPg0KDQpgYGB7cn0NCmNsdXN0ZXJfbWVhbiA8LSBhZ2dyZWdhdGUobGFiZWxlZF9kYXRhLCBieSA9IGxpc3QobGFiZWxlZF9kYXRhJGNsdXN0ZXIpLCBGVU4gPSBtZWFuKQ0KY2x1c3Rlcl9tZWFuDQp0YWJsZShsYWJlbGVkX2RhdGEkY2x1c3RlcikNCmBgYA0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+RWplcmNpY2lvIE3DqXhpY28gMjAyNDwvc3Bhbj4NCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5JbXBvcnRhciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPg0KDQpgYGB7cn0NCmRhdGFzZXQgPC0gcmVhZC5jc3YoJ0M6XFxVc2Vyc1xcQUNFUlxcRG93bmxvYWRzXFxtZXhpY28yMDI0LmNzdicpDQpgYGANCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5Fc2NhbGFyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+DQoNCmBgYHtyfQ0Kc2NhbGVkX2RhdGEgPC0gc2NhbGUoZGF0YXNldFssIC0xXSkNCmBgYA0KDQojIyA8c3BhbiBzdHlsZT0iY29sb3I6IHJlZDsiPkRldGVybWluYXIgZWwgbsO6bWVybyDDs3B0aW1vIGRlIGNsdXN0ZXJzPC9zcGFuPg0KDQpgYGB7cn0NCnNldC5zZWVkKDEyMykNCmdhcF9zdGF0IDwtIGNsdXNHYXAoc2NhbGVkX2RhdGEsIEZVTiA9IGttZWFucywgbnN0YXJ0ID0gMjUsIEsubWF4ID0gMTAsIEIgPSA1MCkNCnBsb3QoZ2FwX3N0YXQsIHhsYWIgPSAiTsO6bWVybyBkZSBjbHVzdGVycyIsIG1haW4gPSAiR2FwIFN0YXRpc3RpYyBmb3IgT3B0aW1hbCBDbHVzdGVycyIpDQpgYGANCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5HZW5lcmFyIGxvcyBjbHVzdGVyczwvc3Bhbj4NCg0KYGBge3J9DQpuX2NsdXN0ZXJzIDwtIG1heFNFKGdhcF9zdGF0JFRhYlssICJnYXAiXSwgZ2FwX3N0YXQkVGFiWywgIlNFLnNpbSJdLCBtZXRob2QgPSAiZmlyc3RTRW1heCIpDQpzZXQuc2VlZCgxMjMpDQprbWVhbnNfcmVzdWx0IDwtIGttZWFucyhzY2FsZWRfZGF0YSwgY2VudGVycyA9IG5fY2x1c3RlcnMsIG5zdGFydCA9IDI1KQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+QXNpZ25hciBsb3MgY2x1c3RlcnMgYSBsb3MgZGF0b3M8L3NwYW4+DQoNCmBgYHtyfQ0KZGF0YXNldCRjbHVzdGVyIDwtIGFzLmZhY3RvcihrbWVhbnNfcmVzdWx0JGNsdXN0ZXIpDQpgYGANCg0KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiByZWQ7Ij5Db21wYXJhciBjbHVzdGVyczwvc3Bhbj4NCg0KYGBge3Igd2FybmluZz1GQUxTRX0NCmNsdXN0ZXJfbWVhbl9tZXggPC0gYWdncmVnYXRlKGRhdGFzZXRbLCAtMV0sIGJ5ID0gbGlzdChkYXRhc2V0JGNsdXN0ZXIpLCBGVU4gPSBtZWFuKQ0KcHJpbnQoY2x1c3Rlcl9tZWFuX21leCkNCmBgYA0KDQojIyA8c3BhbiBzdHlsZT0iY29sb3I6IHJlZDsiPk9idGVuZXIgbWFwYSBkZSBNw6l4aWNvPC9zcGFuPg0KDQpgYGB7cn0NCm1leGljb19tYXAgPC0gbmVfc3RhdGVzKGNvdW50cnkgPSAiTWV4aWNvIiwgcmV0dXJuY2xhc3MgPSAic2YiKQ0KYGBgDQoNCmBgYHtyfQ0KbWV4aWNvX2NsdXN0ZXJzIDwtIGxlZnRfam9pbihtZXhpY29fbWFwLCBkYXRhc2V0LCBieSA9ICJuYW1lIikNCnBhbGV0dGVfY29sb3JzIDwtIGMoIiMxZjc4YjQiLCAiIzMzYTAyYyIsICIjZTMxYTFjIiwgIiNmZjdmMDAiLCAiIzZhM2Q5YSIpDQpgYGANCg0KYGBge3J9DQpnZW5lcmF0ZV9jbHVzdGVyX21hcCA8LSBmdW5jdGlvbih2YXJpYWJsZSwgdGl0bGUpIHsNCiAgZ2dwbG90KG1leGljb19jbHVzdGVycykgKw0KICAgIGdlb21fc2YoYWVzX3N0cmluZyhmaWxsID0gdmFyaWFibGUpLCBjb2xvciA9ICJibGFjayIpICsNCiAgICBzY2FsZV9maWxsX3ZpcmlkaXNfYygpICsgIA0KICAgIGxhYnModGl0bGUgPSB0aXRsZSkgKw0KICAgIHRoZW1lX21pbmltYWwoKQ0KfQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+TWFwYSBkZSBNw6l4aWNvIC0gUG9ibGFjacOzbjwvc3Bhbj4NCg0KYGBge3J9DQptYXAxIDwtIGdlbmVyYXRlX2NsdXN0ZXJfbWFwKCJQb2JsYWNpw7NuIiwgIkNsdXN0ZXJzIHBvciBQb2JsYWNpw7NuIGVuIE3DqXhpY28iKQ0KbWFwMQ0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+TWFwYSBkZSBNw6l4aWNvIC0gUElCIHBlciBjw6FwaXRhPC9zcGFuPg0KDQpgYGB7cn0NCm1hcDIgPC0gZ2VuZXJhdGVfY2x1c3Rlcl9tYXAoIlBJQi5wZXIuY8OhcGl0YSIsICJDbHVzdGVycyBwb3IgUElCIHBlciBjw6FwaXRhIGVuIE3DqXhpY28iKQ0KbWFwMg0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+TWFwYSBkZSBNw6l4aWNvIC0gRXNwZXJhbnphIGRlIHZpZGE8L3NwYW4+DQoNCmBgYHtyfQ0KbWFwMyA8LSBnZW5lcmF0ZV9jbHVzdGVyX21hcCgiRXNwZXJhbnphLmRlLnZpZGEiLCAiQ2x1c3RlcnMgcG9yIEVzcGVyYW56YSBkZSBWaWRhIGVuIE3DqXhpY28iKQ0KbWFwMw0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+TWFwYSBkZSBNw6l4aWNvIC0gVGFzYSBkZSBwb2JyZXphPC9zcGFuPg0KDQpgYGB7cn0NCm1hcDQgPC0gZ2VuZXJhdGVfY2x1c3Rlcl9tYXAoIlRhc2EuZGUucG9icmV6YSIsICJDbHVzdGVycyBwb3IgVGFzYSBkZSBQb2JyZXphIGVuIE3DqXhpY28iKQ0KbWFwNA0KYGBgDQoNCiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyI+TWFwYSBkZSBNw6l4aWNvIC0gVGFzYSBkZSBhbGZhYmV0aXphY2nDs248L3NwYW4+DQoNCmBgYHtyfQ0KbWFwNSA8LSBnZW5lcmF0ZV9jbHVzdGVyX21hcCgiVGFzYS5kZS5hbGZhYmV0aXphY2nDs24iLCAiQ2x1c3RlcnMgcG9yIFRhc2EgZGUgQWxmYWJldGl6YWNpw7NuIGVuIE3DqXhpY28iKQ0KbWFwNQ0KYGBgDQoNCg==