Conexto

La base de datos “USArrest” contiene estadisticas en arrestos por cada 100,000 residentes por agresión, asesinato y violacion en cada uno de los 50 estados de EE. UU. en 1973.

Cargar liberias

#install.packages("cluster")
library(cluster)
#install.packages("ggplot2")
library(ggplot2)

#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.2
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("maps")
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
## 
##     votes.repub
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df <- USArrests

Revisar numero de clusters óptimo

summary(df)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00
dfscaled <- scale(df)

set.seed(123)
opt <- clusGap(dfscaled, FUN = kmeans, nstart=1, K.max=8)
plot(opt, xlab="Numero de clusters k")

Creación de Clusters

grupo <- 4

seg <- kmeans(df, grupo)
seg
## K-means clustering with 4 clusters of sizes 18, 12, 13, 7
## 
## Cluster means:
##      Murder   Assault UrbanPop     Rape
## 1  4.161111  83.94444 57.77778 13.52778
## 2 12.033333 239.25000 69.16667 28.50000
## 3  6.730769 157.69231 71.23077 21.08462
## 4 11.800000 300.85714 68.71429 28.85714
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              4              3              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              1              2              4              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              1              2              1              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              1              2              1              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              1              2              3 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              2              4              1              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              1              3              4 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              3              2              3              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              1              1              3 
## 
## Within cluster sum of squares by cluster:
## [1] 16065.634  8687.183  7955.062  6023.923
##  (between_SS / total_SS =  89.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
clusfila <- cbind(df, cluster = seg$cluster)
clusfila
##                Murder Assault UrbanPop Rape cluster
## Alabama          13.2     236       58 21.2       2
## Alaska           10.0     263       48 44.5       2
## Arizona           8.1     294       80 31.0       4
## Arkansas          8.8     190       50 19.5       3
## California        9.0     276       91 40.6       4
## Colorado          7.9     204       78 38.7       2
## Connecticut       3.3     110       77 11.1       1
## Delaware          5.9     238       72 15.8       2
## Florida          15.4     335       80 31.9       4
## Georgia          17.4     211       60 25.8       2
## Hawaii            5.3      46       83 20.2       1
## Idaho             2.6     120       54 14.2       1
## Illinois         10.4     249       83 24.0       2
## Indiana           7.2     113       65 21.0       1
## Iowa              2.2      56       57 11.3       1
## Kansas            6.0     115       66 18.0       1
## Kentucky          9.7     109       52 16.3       1
## Louisiana        15.4     249       66 22.2       2
## Maine             2.1      83       51  7.8       1
## Maryland         11.3     300       67 27.8       4
## Massachusetts     4.4     149       85 16.3       3
## Michigan         12.1     255       74 35.1       2
## Minnesota         2.7      72       66 14.9       1
## Mississippi      16.1     259       44 17.1       2
## Missouri          9.0     178       70 28.2       3
## Montana           6.0     109       53 16.4       1
## Nebraska          4.3     102       62 16.5       1
## Nevada           12.2     252       81 46.0       2
## New Hampshire     2.1      57       56  9.5       1
## New Jersey        7.4     159       89 18.8       3
## New Mexico       11.4     285       70 32.1       4
## New York         11.1     254       86 26.1       2
## North Carolina   13.0     337       45 16.1       4
## North Dakota      0.8      45       44  7.3       1
## Ohio              7.3     120       75 21.4       3
## Oklahoma          6.6     151       68 20.0       3
## Oregon            4.9     159       67 29.3       3
## Pennsylvania      6.3     106       72 14.9       1
## Rhode Island      3.4     174       87  8.3       3
## South Carolina   14.4     279       48 22.5       4
## South Dakota      3.8      86       45 12.8       1
## Tennessee        13.2     188       59 26.9       3
## Texas            12.7     201       80 25.5       2
## Utah              3.2     120       80 22.9       3
## Vermont           2.2      48       32 11.2       1
## Virginia          8.5     156       63 20.7       3
## Washington        4.0     145       73 26.2       3
## West Virginia     5.7      81       39  9.3       1
## Wisconsin         2.6      53       66 10.8       1
## Wyoming           6.8     161       60 15.6       3

Visualización de Clusters

fviz_cluster(seg, data = df)

Filtración por clusters

clusfila <- as.data.frame(clusfila)
cluster1 <- filter(clusfila, cluster == 1)
cluster2 <- filter(clusfila, cluster == 2)
cluster3 <- filter(clusfila, cluster == 3)
cluster4 <- filter(clusfila, cluster == 4)

Determinar los clusters con mayor peligro y menor en base a promedio

mean_assault1 <- mean(cluster1$Assault)
mean_assault2 <- mean(cluster2$Assault)
mean_assault3 <- mean(cluster3$Assault)
mean_assault4 <- mean(cluster4$Assault)

mean_assault4 
## [1] 300.8571
mean_assault2 
## [1] 239.25
mean_assault3
## [1] 157.6923
mean_assault1 
## [1] 83.94444

Renombrar clusters con color aproiado

rojo <- filter(clusfila, cluster == 4)
naranja <- filter(clusfila, cluster == 2)
amarillo <- filter(clusfila, cluster == 3)
verde <- filter(clusfila, cluster == 1)

Ajuste para poder útlizar los nombres de estados

rojo$State <- rownames(rojo)
naranja$State <- rownames(naranja)
amarillo$State <- rownames(amarillo)
verde$State <- rownames(verde)

Creación de mapa

map(database = "state")

rojo_mapa <- rojo$State
map(database = "state", regions = rojo_mapa, col = "red", fill = T, add = TRUE)


naranja_mapa <- naranja$State
map(database = "state", regions = naranja_mapa, col = "orange", fill = T, add = TRUE)


amarillo_mapa <- amarillo$State
map(database = "state", regions = amarillo_mapa, col = "yellow", fill = T, add = TRUE)

verde_mapa <- verde$State
map(database = "state", regions = verde_mapa, col = "green", fill = T, add = TRUE)

legend("topright", legend = c("Más Seguro", "Más Inseguro", "Seguro", "Inseguro"), 
       fill = c("green", "red", "yellow", "orange"))

La segmentación o clusters es un algoritmo útil para identificar el nivel de criminalidad en diferentes regiones de Estados Unidos. Dentro del análisis sobre los datos de arrestos en el país en 1973, se ha identificado que el Cluster 1 representa los estados más seguros, mientras que el Cluster 2 se caracteriza por ser el más inseguro. Por otro lado, el Cluster 3 muestra un nivel de inseguridad menor en comparación con el promedio, mientras que el Cluster 4 es catalogado como menos seguro en términos generales. Estos resultados muestran una tendencia a tener mayor nivel de inseguridad en los estados del sur, mientras los del norte suelen ser más seguros.
LS0tCnRpdGxlOiAiTWFwYSBkZSBQZWxpZ3JvIENsdXN0ZXJzIgphdXRob3I6ICJHaWxiZXJ0byBNZW5jaGFjYSBBMDExNzc4OTkiCmRhdGU6ICIyMDI0LTAyLTIwIgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6IHNpbXBsZXgKLS0tCiFbXSgvVXNlcnMvbGlnaHRlZGl0L0RvY3VtZW50cy9URUMgU0VNRVNUUkUgNi4xL00yL1IvaW5zZWd1cmlkYWQtcGlzdG9sYS1hcm1hLmpwZykKCiMgQ29uZXh0bwpMYSBiYXNlIGRlIGRhdG9zICJVU0FycmVzdCIgY29udGllbmUgZXN0YWRpc3RpY2FzIGVuIGFycmVzdG9zIHBvciBjYWRhIDEwMCwwMDAgcmVzaWRlbnRlcyBwb3IgYWdyZXNpw7NuLCBhc2VzaW5hdG8geSB2aW9sYWNpb24gZW4gY2FkYSB1bm8gZGUgbG9zIDUwIGVzdGFkb3MgZGUgRUUuIFVVLiBlbiAxOTczLgoKCiMgQ2FyZ2FyIGxpYmVyaWFzCgpgYGB7cn0KI2luc3RhbGwucGFja2FnZXMoImNsdXN0ZXIiKQpsaWJyYXJ5KGNsdXN0ZXIpCiNpbnN0YWxsLnBhY2thZ2VzKCJnZ3Bsb3QyIikKbGlicmFyeShnZ3Bsb3QyKQoKI2luc3RhbGwucGFja2FnZXMoImRhdGEudGFibGUiKQpsaWJyYXJ5KGRhdGEudGFibGUpCgojaW5zdGFsbC5wYWNrYWdlcygiZmFjdG9leHRyYSIpCmxpYnJhcnkoZmFjdG9leHRyYSkKCiNpbnN0YWxsLnBhY2thZ2VzKCJtYXBzIikKbGlicmFyeShtYXBzKQpsaWJyYXJ5KGRwbHlyKQoKZGYgPC0gVVNBcnJlc3RzCmBgYAoKIyBSZXZpc2FyIG51bWVybyBkZSBjbHVzdGVycyDDs3B0aW1vCgpgYGB7cn0Kc3VtbWFyeShkZikKCmRmc2NhbGVkIDwtIHNjYWxlKGRmKQoKc2V0LnNlZWQoMTIzKQpvcHQgPC0gY2x1c0dhcChkZnNjYWxlZCwgRlVOID0ga21lYW5zLCBuc3RhcnQ9MSwgSy5tYXg9OCkKcGxvdChvcHQsIHhsYWI9Ik51bWVybyBkZSBjbHVzdGVycyBrIikKYGBgCgoKIyBDcmVhY2nDs24gZGUgQ2x1c3RlcnMKYGBge3J9CmdydXBvIDwtIDQKCnNlZyA8LSBrbWVhbnMoZGYsIGdydXBvKQpzZWcKCmNsdXNmaWxhIDwtIGNiaW5kKGRmLCBjbHVzdGVyID0gc2VnJGNsdXN0ZXIpCmNsdXNmaWxhCgpgYGAKCiMgVmlzdWFsaXphY2nDs24gZGUgQ2x1c3RlcnMKCmBgYHtyfQpmdml6X2NsdXN0ZXIoc2VnLCBkYXRhID0gZGYpCmBgYAoKIyBGaWx0cmFjacOzbiBwb3IgY2x1c3RlcnMKYGBge3J9CmNsdXNmaWxhIDwtIGFzLmRhdGEuZnJhbWUoY2x1c2ZpbGEpCmNsdXN0ZXIxIDwtIGZpbHRlcihjbHVzZmlsYSwgY2x1c3RlciA9PSAxKQpjbHVzdGVyMiA8LSBmaWx0ZXIoY2x1c2ZpbGEsIGNsdXN0ZXIgPT0gMikKY2x1c3RlcjMgPC0gZmlsdGVyKGNsdXNmaWxhLCBjbHVzdGVyID09IDMpCmNsdXN0ZXI0IDwtIGZpbHRlcihjbHVzZmlsYSwgY2x1c3RlciA9PSA0KQpgYGAKCiMgRGV0ZXJtaW5hciBsb3MgY2x1c3RlcnMgY29uIG1heW9yIHBlbGlncm8geSBtZW5vciBlbiBiYXNlIGEgcHJvbWVkaW8KCmBgYHtyfQptZWFuX2Fzc2F1bHQxIDwtIG1lYW4oY2x1c3RlcjEkQXNzYXVsdCkKbWVhbl9hc3NhdWx0MiA8LSBtZWFuKGNsdXN0ZXIyJEFzc2F1bHQpCm1lYW5fYXNzYXVsdDMgPC0gbWVhbihjbHVzdGVyMyRBc3NhdWx0KQptZWFuX2Fzc2F1bHQ0IDwtIG1lYW4oY2x1c3RlcjQkQXNzYXVsdCkKCm1lYW5fYXNzYXVsdDQgCm1lYW5fYXNzYXVsdDIgCm1lYW5fYXNzYXVsdDMKbWVhbl9hc3NhdWx0MSAKYGBgCiMgUmVub21icmFyIGNsdXN0ZXJzIGNvbiBjb2xvciBhcHJvaWFkbwoKYGBge3J9CnJvam8gPC0gZmlsdGVyKGNsdXNmaWxhLCBjbHVzdGVyID09IDQpCm5hcmFuamEgPC0gZmlsdGVyKGNsdXNmaWxhLCBjbHVzdGVyID09IDIpCmFtYXJpbGxvIDwtIGZpbHRlcihjbHVzZmlsYSwgY2x1c3RlciA9PSAzKQp2ZXJkZSA8LSBmaWx0ZXIoY2x1c2ZpbGEsIGNsdXN0ZXIgPT0gMSkKCmBgYAoKIyBBanVzdGUgcGFyYSBwb2RlciDDunRsaXphciBsb3Mgbm9tYnJlcyBkZSBlc3RhZG9zCgpgYGB7cn0Kcm9qbyRTdGF0ZSA8LSByb3duYW1lcyhyb2pvKQpuYXJhbmphJFN0YXRlIDwtIHJvd25hbWVzKG5hcmFuamEpCmFtYXJpbGxvJFN0YXRlIDwtIHJvd25hbWVzKGFtYXJpbGxvKQp2ZXJkZSRTdGF0ZSA8LSByb3duYW1lcyh2ZXJkZSkKYGBgCgojIENyZWFjacOzbiBkZSBtYXBhIAoKYGBge3J9Cm1hcChkYXRhYmFzZSA9ICJzdGF0ZSIpCgpyb2pvX21hcGEgPC0gcm9qbyRTdGF0ZQptYXAoZGF0YWJhc2UgPSAic3RhdGUiLCByZWdpb25zID0gcm9qb19tYXBhLCBjb2wgPSAicmVkIiwgZmlsbCA9IFQsIGFkZCA9IFRSVUUpCgoKbmFyYW5qYV9tYXBhIDwtIG5hcmFuamEkU3RhdGUKbWFwKGRhdGFiYXNlID0gInN0YXRlIiwgcmVnaW9ucyA9IG5hcmFuamFfbWFwYSwgY29sID0gIm9yYW5nZSIsIGZpbGwgPSBULCBhZGQgPSBUUlVFKQoKCmFtYXJpbGxvX21hcGEgPC0gYW1hcmlsbG8kU3RhdGUKbWFwKGRhdGFiYXNlID0gInN0YXRlIiwgcmVnaW9ucyA9IGFtYXJpbGxvX21hcGEsIGNvbCA9ICJ5ZWxsb3ciLCBmaWxsID0gVCwgYWRkID0gVFJVRSkKCnZlcmRlX21hcGEgPC0gdmVyZGUkU3RhdGUKbWFwKGRhdGFiYXNlID0gInN0YXRlIiwgcmVnaW9ucyA9IHZlcmRlX21hcGEsIGNvbCA9ICJncmVlbiIsIGZpbGwgPSBULCBhZGQgPSBUUlVFKQoKbGVnZW5kKCJ0b3ByaWdodCIsIGxlZ2VuZCA9IGMoIk3DoXMgU2VndXJvIiwgIk3DoXMgSW5zZWd1cm8iLCAiU2VndXJvIiwgIkluc2VndXJvIiksIAogICAgICAgZmlsbCA9IGMoImdyZWVuIiwgInJlZCIsICJ5ZWxsb3ciLCAib3JhbmdlIikpCgoKYGBgCgoKPGRpdiBzdHlsZT0idGV4dC1hbGlnbjoganVzdGlmeSI+CkxhICoqc2VnbWVudGFjacOzbioqIG8gKmNsdXN0ZXJzKiBlcyB1biBhbGdvcml0bW8gw7p0aWwgcGFyYSBpZGVudGlmaWNhciBlbCBuaXZlbCBkZSBjcmltaW5hbGlkYWQgZW4gZGlmZXJlbnRlcyByZWdpb25lcyBkZSBFc3RhZG9zIFVuaWRvcy4gRGVudHJvIGRlbCBhbsOhbGlzaXMgc29icmUgbG9zIGRhdG9zIGRlIGFycmVzdG9zIGVuIGVsIHBhw61zIGVuIDE5NzMsIHNlIGhhIGlkZW50aWZpY2FkbyBxdWUgZWwgQ2x1c3RlciAxIHJlcHJlc2VudGEgbG9zIGVzdGFkb3MgbcOhcyBzZWd1cm9zLCBtaWVudHJhcyBxdWUgZWwgQ2x1c3RlciAyIHNlIGNhcmFjdGVyaXphIHBvciBzZXIgZWwgbcOhcyBpbnNlZ3Vyby4gUG9yIG90cm8gbGFkbywgZWwgQ2x1c3RlciAzIG11ZXN0cmEgdW4gbml2ZWwgZGUgaW5zZWd1cmlkYWQgbWVub3IgZW4gY29tcGFyYWNpw7NuIGNvbiBlbCBwcm9tZWRpbywgbWllbnRyYXMgcXVlIGVsIENsdXN0ZXIgNCBlcyBjYXRhbG9nYWRvIGNvbW8gbWVub3Mgc2VndXJvIGVuIHTDqXJtaW5vcyBnZW5lcmFsZXMuIEVzdG9zIHJlc3VsdGFkb3MgbXVlc3RyYW4gdW5hIHRlbmRlbmNpYSBhIHRlbmVyIG1heW9yIG5pdmVsIGRlIGluc2VndXJpZGFkIGVuIGxvcyBlc3RhZG9zIGRlbCBzdXIsIG1pZW50cmFzIGxvcyBkZWwgbm9ydGUgc3VlbGVuIHNlciBtw6FzIHNlZ3Vyb3MuPGRpdi8+CgoKCgoKCg==