Contexto

La base de datos “USArrests” contiene estadísticas en arrestos por cada 100,000 residentes por agresión, asesinato y violación en cada uno de los 50 estados de EE.UU. en 1973.

Paso 1. Instalar paquetes y llamar librerías

#install.packages("cluster")
library(cluster)
## Warning: package 'cluster' was built under R version 4.3.2
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.2
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(maps)
## Warning: package 'maps' was built under R version 4.3.2
## 
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
## 
##     votes.repub
library(tibble)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Paso 2. Obtener los datos

datos <- USArrests
head(datos)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
summary(datos)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

Paso 3. Escalar las variables

df <- scale(datos)

Paso 4. Optimizar la cantidad de grupos

La cantidad óptima de grupos corresponde al punto más alto de la siguiente gráfica:

set.seed(123)

optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max = 8)
plot(optimizacion, xlab = "Número de clusters k")

Paso 5. Cantidad de grupos

grupos <- 4

Paso 6. Generar los segmentos

segmentos <- kmeans(df, grupos)
segmentos
## K-means clustering with 4 clusters of sizes 13, 8, 16, 13
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2  1.4118898  0.8743346 -0.8145211  0.01927104
## 3 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 4  0.6950701  1.0394414  0.7226370  1.27693964
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              4              4              2              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              4              3              3              4              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              1              4              3              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              1              2              1              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              4              1              2              4 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              4              1              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              4              2              1              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              2              4              3              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              1              1              3 
## 
## Within cluster sum of squares by cluster:
## [1] 11.952463  8.316061 16.212213 19.922437
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Paso 7. Asignar el grupo al que pertenece cada observación

asignacion <- cbind(cluster = segmentos$cluster, datos)
head(asignacion)
##            cluster Murder Assault UrbanPop Rape
## Alabama          2   13.2     236       58 21.2
## Alaska           4   10.0     263       48 44.5
## Arizona          4    8.1     294       80 31.0
## Arkansas         2    8.8     190       50 19.5
## California       4    9.0     276       91 40.6
## Colorado         4    7.9     204       78 38.7
#asignacion$cluster <- as.factor(asignacion$cluster)
#summary(asignacion$cluster)

Paso 8. Graficar los clusters

fviz_cluster(segmentos, data = datos)

Paso 9. Comparar segmentos

promedio <- aggregate(asignacion, by = list(asignacion$cluster), FUN=mean)
promedio
##   Group.1 cluster   Murder   Assault UrbanPop     Rape
## 1       1       1  3.60000  78.53846 52.07692 12.17692
## 2       2       2 13.93750 243.62500 53.75000 21.41250
## 3       3       3  5.65625 138.87500 73.87500 18.78125
## 4       4       4 10.81538 257.38462 76.00000 33.19231

Paso 10. Colorear el mapa

USArrests <- rownames_to_column(asignacion, "State")
head(USArrests)
##        State cluster Murder Assault UrbanPop Rape
## 1    Alabama       2   13.2     236       58 21.2
## 2     Alaska       4   10.0     263       48 44.5
## 3    Arizona       4    8.1     294       80 31.0
## 4   Arkansas       2    8.8     190       50 19.5
## 5 California       4    9.0     276       91 40.6
## 6   Colorado       4    7.9     204       78 38.7
cluster1 <- filter(USArrests, cluster == 1)
cluster2 <- filter(USArrests, cluster == 2)
cluster3 <- filter(USArrests, cluster == 3)
cluster4 <- filter(USArrests, cluster == 4)

map(database = "state")
map(database = "state", regions = cluster1$State, col="green", fill=TRUE, add = TRUE)
map(database = "state", regions = cluster2$State, col="red", fill=TRUE, add = TRUE)
map(database = "state", regions = cluster3$State, col="yellow", fill=TRUE, add = TRUE)
map(database = "state", regions = cluster4$State, col="orange", fill=TRUE, add = TRUE)
legend("topright", 
       legend = c("Muy Seguro", "Seguro", "Poco Seguro", "Inseguro"), 
       fill = c("green", "yellow", "orange", "red"),
       x = "bottomright", y = 1
)

Conclusión

Se identificó el nivel de criminalidad en los diferentes Estados de Estados Unidos y se crearon 4 divisiones en una escala del más seguro al menos seguro.

LS0tDQp0aXRsZTogIlVTQXJyZXN0cyAoY2x1c3RlcnMpIg0KYXV0aG9yOiBLYXJsYSBTw6FuY2hleiAtIEEwMTE5ODE4NCwgTmF5ZWxpIFBlw7FhIC0gQTAxMzY4NTE2LCBLYXRoaWEgUnVpeiAtIEEwMTU3MTA5NCwgSmF6bWluIENvcnRleiAtIEEwMDgzMTEwNQ0KZGF0ZTogIjIwMjQtMDItMjAiDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KLS0tDQoNCg0KIVtdKEM6XFxVc2Vyc1xca2F0aGlcXE9uZURyaXZlXFxFc2NyaXRvcmlvXFxNMl9JQSBjb24gSW1wYWN0byBFbXByZXNhcmlhbFxcdXNhZmxhZy5naWYpDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkNCmBgYA0KDQoNCiMgQ29udGV4dG8NCkxhIGJhc2UgZGUgZGF0b3MgIipVU0FycmVzdHMqIiBjb250aWVuZSBlc3RhZMOtc3RpY2FzIGVuIGFycmVzdG9zIHBvciBjYWRhIDEwMCwwMDAgcmVzaWRlbnRlcyBwb3IgYWdyZXNpw7NuLCBhc2VzaW5hdG8geSB2aW9sYWNpw7NuIGVuIGNhZGEgdW5vIGRlIGxvcyA1MCBlc3RhZG9zIGRlIEVFLlVVLiBlbiAxOTczLg0KDQoNCiMgUGFzbyAxLiBJbnN0YWxhciBwYXF1ZXRlcyB5IGxsYW1hciBsaWJyZXLDrWFzDQoNCmBgYHtyIH0NCiNpbnN0YWxsLnBhY2thZ2VzKCJjbHVzdGVyIikNCmxpYnJhcnkoY2x1c3RlcikNCiNpbnN0YWxsLnBhY2thZ2VzKCJnZ3Bsb3QyIikNCmxpYnJhcnkoZ2dwbG90MikNCiNpbnN0YWxsLnBhY2thZ2VzKCJkYXRhLnRhYmxlIikNCmxpYnJhcnkoZGF0YS50YWJsZSkNCiNpbnN0YWxsLnBhY2thZ2VzKCJmYWN0b2V4dHJhIikNCmxpYnJhcnkoZmFjdG9leHRyYSkNCmxpYnJhcnkobWFwcykNCmxpYnJhcnkodGliYmxlKQ0KbGlicmFyeShkcGx5cikNCmBgYA0KDQoNCiMgUGFzbyAyLiBPYnRlbmVyIGxvcyBkYXRvcw0KDQpgYGB7ciB9DQpkYXRvcyA8LSBVU0FycmVzdHMNCmhlYWQoZGF0b3MpDQpzdW1tYXJ5KGRhdG9zKQ0KYGBgDQoNCiMgUGFzbyAzLiBFc2NhbGFyIGxhcyB2YXJpYWJsZXMNCmBgYHtyIH0NCmRmIDwtIHNjYWxlKGRhdG9zKQ0KYGBgDQoNCiMgUGFzbyA0LiBPcHRpbWl6YXIgbGEgY2FudGlkYWQgZGUgZ3J1cG9zDQoNCkxhIGNhbnRpZGFkIMOzcHRpbWEgZGUgZ3J1cG9zIGNvcnJlc3BvbmRlIGFsIHB1bnRvIG3DoXMgYWx0byBkZSBsYSBzaWd1aWVudGUgZ3LDoWZpY2E6DQoNCg0KYGBge3IgfQ0Kc2V0LnNlZWQoMTIzKQ0KDQpvcHRpbWl6YWNpb24gPC0gY2x1c0dhcChkZiwgRlVOPWttZWFucywgbnN0YXJ0PTEsIEsubWF4ID0gOCkNCnBsb3Qob3B0aW1pemFjaW9uLCB4bGFiID0gIk7Dum1lcm8gZGUgY2x1c3RlcnMgayIpDQpgYGANCg0KIyBQYXNvIDUuIENhbnRpZGFkIGRlIGdydXBvcw0KDQpgYGB7ciB9DQpncnVwb3MgPC0gNA0KYGBgDQoNCg0KIyBQYXNvIDYuIEdlbmVyYXIgbG9zIHNlZ21lbnRvcw0KDQpgYGB7ciB9DQpzZWdtZW50b3MgPC0ga21lYW5zKGRmLCBncnVwb3MpDQpzZWdtZW50b3MNCmBgYA0KDQojIFBhc28gNy4gQXNpZ25hciBlbCBncnVwbyBhbCBxdWUgcGVydGVuZWNlIGNhZGEgb2JzZXJ2YWNpw7NuDQoNCmBgYHtyIH0NCmFzaWduYWNpb24gPC0gY2JpbmQoY2x1c3RlciA9IHNlZ21lbnRvcyRjbHVzdGVyLCBkYXRvcykNCmhlYWQoYXNpZ25hY2lvbikNCg0KI2FzaWduYWNpb24kY2x1c3RlciA8LSBhcy5mYWN0b3IoYXNpZ25hY2lvbiRjbHVzdGVyKQ0KI3N1bW1hcnkoYXNpZ25hY2lvbiRjbHVzdGVyKQ0KYGBgDQoNCiMgUGFzbyA4LiBHcmFmaWNhciBsb3MgY2x1c3RlcnMNCg0KYGBge3IgfQ0KZnZpel9jbHVzdGVyKHNlZ21lbnRvcywgZGF0YSA9IGRhdG9zKQ0KYGBgDQoNCg0KIyBQYXNvIDkuIENvbXBhcmFyIHNlZ21lbnRvcw0KDQpgYGB7ciB9DQpwcm9tZWRpbyA8LSBhZ2dyZWdhdGUoYXNpZ25hY2lvbiwgYnkgPSBsaXN0KGFzaWduYWNpb24kY2x1c3RlciksIEZVTj1tZWFuKQ0KcHJvbWVkaW8NCg0KYGBgDQoNCg0KDQojIFBhc28gMTAuIENvbG9yZWFyIGVsIG1hcGENCg0KYGBge3IgfQ0KVVNBcnJlc3RzIDwtIHJvd25hbWVzX3RvX2NvbHVtbihhc2lnbmFjaW9uLCAiU3RhdGUiKQ0KaGVhZChVU0FycmVzdHMpDQpjbHVzdGVyMSA8LSBmaWx0ZXIoVVNBcnJlc3RzLCBjbHVzdGVyID09IDEpDQpjbHVzdGVyMiA8LSBmaWx0ZXIoVVNBcnJlc3RzLCBjbHVzdGVyID09IDIpDQpjbHVzdGVyMyA8LSBmaWx0ZXIoVVNBcnJlc3RzLCBjbHVzdGVyID09IDMpDQpjbHVzdGVyNCA8LSBmaWx0ZXIoVVNBcnJlc3RzLCBjbHVzdGVyID09IDQpDQoNCm1hcChkYXRhYmFzZSA9ICJzdGF0ZSIpDQptYXAoZGF0YWJhc2UgPSAic3RhdGUiLCByZWdpb25zID0gY2x1c3RlcjEkU3RhdGUsIGNvbD0iZ3JlZW4iLCBmaWxsPVRSVUUsIGFkZCA9IFRSVUUpDQptYXAoZGF0YWJhc2UgPSAic3RhdGUiLCByZWdpb25zID0gY2x1c3RlcjIkU3RhdGUsIGNvbD0icmVkIiwgZmlsbD1UUlVFLCBhZGQgPSBUUlVFKQ0KbWFwKGRhdGFiYXNlID0gInN0YXRlIiwgcmVnaW9ucyA9IGNsdXN0ZXIzJFN0YXRlLCBjb2w9InllbGxvdyIsIGZpbGw9VFJVRSwgYWRkID0gVFJVRSkNCm1hcChkYXRhYmFzZSA9ICJzdGF0ZSIsIHJlZ2lvbnMgPSBjbHVzdGVyNCRTdGF0ZSwgY29sPSJvcmFuZ2UiLCBmaWxsPVRSVUUsIGFkZCA9IFRSVUUpDQpsZWdlbmQoInRvcHJpZ2h0IiwgDQogICAgICAgbGVnZW5kID0gYygiTXV5IFNlZ3VybyIsICJTZWd1cm8iLCAiUG9jbyBTZWd1cm8iLCAiSW5zZWd1cm8iKSwgDQogICAgICAgZmlsbCA9IGMoImdyZWVuIiwgInllbGxvdyIsICJvcmFuZ2UiLCAicmVkIiksDQogICAgICAgeCA9ICJib3R0b21yaWdodCIsIHkgPSAxDQopDQoNCg0KYGBgDQoNCg0KIyBDb25jbHVzacOzbg0KU2UgaWRlbnRpZmljw7MgZWwgbml2ZWwgZGUgY3JpbWluYWxpZGFkIGVuIGxvcyBkaWZlcmVudGVzIEVzdGFkb3MgZGUgRXN0YWRvcyBVbmlkb3MgeSBzZSBjcmVhcm9uIDQgZGl2aXNpb25lcyBlbiB1bmEgZXNjYWxhIGRlbCBtw6FzIHNlZ3VybyBhbCBtZW5vcyBzZWd1cm8uDQo=