Contexto

La base de. datos “USArrests” contiene estadisticas en arrestos por cada 100,000 resudentes por agresion, asesinato y violacion en cada uno de los 50 estados de EE.UU en 1973.

Fuente: USA Arrests Dataset

Paso 1. Instalar paquetes y llamar librerias

#install.packages("maps")
library(maps)
## Warning: package 'maps' was built under R version 4.3.1
#install.packages("cluster")
library(cluster)
## 
## Attaching package: 'cluster'
## The following object is masked from 'package:maps':
## 
##     votes.repub
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.1
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Paso 2. Base de datos y Analizar Variables

datos <- USArrests
summary(datos)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

Paso 3. Escalar

df <- scale(datos)

Paso 4. Generar grupos

set.seed(123)
grupos <- 4
segmentos <- kmeans(df,grupos)
segmentos
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1  1.4118898  0.8743346 -0.8145211  0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 4  0.6950701  1.0394414  0.7226370  1.27693964
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              4              4              1              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              4              3              3              4              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              2              4              3              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              2              1              2              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              4              2              1              4 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              4              2              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              4              1              2              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              4              3              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              2              2              3 
## 
## Within cluster sum of squares by cluster:
## [1]  8.316061 11.952463 16.212213 19.922437
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Paso 5. Asignar a los grupos

asignacion <- cbind(datos, cluster = segmentos$cluster)
asignacion
##                Murder Assault UrbanPop Rape cluster
## Alabama          13.2     236       58 21.2       1
## Alaska           10.0     263       48 44.5       4
## Arizona           8.1     294       80 31.0       4
## Arkansas          8.8     190       50 19.5       1
## California        9.0     276       91 40.6       4
## Colorado          7.9     204       78 38.7       4
## Connecticut       3.3     110       77 11.1       3
## Delaware          5.9     238       72 15.8       3
## Florida          15.4     335       80 31.9       4
## Georgia          17.4     211       60 25.8       1
## Hawaii            5.3      46       83 20.2       3
## Idaho             2.6     120       54 14.2       2
## Illinois         10.4     249       83 24.0       4
## Indiana           7.2     113       65 21.0       3
## Iowa              2.2      56       57 11.3       2
## Kansas            6.0     115       66 18.0       3
## Kentucky          9.7     109       52 16.3       2
## Louisiana        15.4     249       66 22.2       1
## Maine             2.1      83       51  7.8       2
## Maryland         11.3     300       67 27.8       4
## Massachusetts     4.4     149       85 16.3       3
## Michigan         12.1     255       74 35.1       4
## Minnesota         2.7      72       66 14.9       2
## Mississippi      16.1     259       44 17.1       1
## Missouri          9.0     178       70 28.2       4
## Montana           6.0     109       53 16.4       2
## Nebraska          4.3     102       62 16.5       2
## Nevada           12.2     252       81 46.0       4
## New Hampshire     2.1      57       56  9.5       2
## New Jersey        7.4     159       89 18.8       3
## New Mexico       11.4     285       70 32.1       4
## New York         11.1     254       86 26.1       4
## North Carolina   13.0     337       45 16.1       1
## North Dakota      0.8      45       44  7.3       2
## Ohio              7.3     120       75 21.4       3
## Oklahoma          6.6     151       68 20.0       3
## Oregon            4.9     159       67 29.3       3
## Pennsylvania      6.3     106       72 14.9       3
## Rhode Island      3.4     174       87  8.3       3
## South Carolina   14.4     279       48 22.5       1
## South Dakota      3.8      86       45 12.8       2
## Tennessee        13.2     188       59 26.9       1
## Texas            12.7     201       80 25.5       4
## Utah              3.2     120       80 22.9       3
## Vermont           2.2      48       32 11.2       2
## Virginia          8.5     156       63 20.7       3
## Washington        4.0     145       73 26.2       3
## West Virginia     5.7      81       39  9.3       2
## Wisconsin         2.6      53       66 10.8       2
## Wyoming           6.8     161       60 15.6       3
promedio <- aggregate(asignacion, by = list(asignacion$cluster), FUN=mean)
promedio
##   Group.1   Murder   Assault UrbanPop     Rape cluster
## 1       1 13.93750 243.62500 53.75000 21.41250       1
## 2       2  3.60000  78.53846 52.07692 12.17692       2
## 3       3  5.65625 138.87500 73.87500 18.78125       3
## 4       4 10.81538 257.38462 76.00000 33.19231       4
#segmentos$cluster

#estadosclusters <- data.frame(Estados = rownames(asignacion), Cluster = (segmentos$cluster))
#estadosclusters

cluster1 = c(rownames(asignacion[asignacion$cluster == 1,]))
cluster1
## [1] "Alabama"        "Arkansas"       "Georgia"        "Louisiana"     
## [5] "Mississippi"    "North Carolina" "South Carolina" "Tennessee"
cluster2 = c(rownames(asignacion[asignacion$cluster == 2,]))
cluster2
##  [1] "Idaho"         "Iowa"          "Kentucky"      "Maine"        
##  [5] "Minnesota"     "Montana"       "Nebraska"      "New Hampshire"
##  [9] "North Dakota"  "South Dakota"  "Vermont"       "West Virginia"
## [13] "Wisconsin"
cluster3 = c(rownames(asignacion[asignacion$cluster == 3,]))
cluster3
##  [1] "Connecticut"   "Delaware"      "Hawaii"        "Indiana"      
##  [5] "Kansas"        "Massachusetts" "New Jersey"    "Ohio"         
##  [9] "Oklahoma"      "Oregon"        "Pennsylvania"  "Rhode Island" 
## [13] "Utah"          "Virginia"      "Washington"    "Wyoming"
cluster4 = c(rownames(asignacion[asignacion$cluster == 4,]))
cluster4
##  [1] "Alaska"     "Arizona"    "California" "Colorado"   "Florida"   
##  [6] "Illinois"   "Maryland"   "Michigan"   "Missouri"   "Nevada"    
## [11] "New Mexico" "New York"   "Texas"

Paso 6. Graficar los clusters

fviz_cluster(segmentos, data = df)

Paso 7. Encontrar la cantidad de grupos óptima

optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max=7)
plot(optimizacion, xlab="Número de Clusters")

Paso 8. Colorear Estados de acuerdo a su cluster

library(maps)

# Plotting maps for different clusters
map(database = "state")
map(database = "state", regions = cluster1, col = "orange", fill = TRUE, add = TRUE)
map(database = "state", regions = cluster2, col = "green", fill = TRUE, add = TRUE)
map(database = "state", regions = cluster3, col = "yellow", fill = TRUE, add = TRUE)
map(database = "state", regions = cluster4, col = "red", fill = TRUE, add = TRUE)

Conclusión

Podemos ver que principalmente los estados del sur del pais estadounidense son los más peligrosos y los estados del norte son los más seguros. Podemos ver que dentro de los cluster #1 es el más peligroso que son los estados de color rojo tienen estadisticas promedio de:

Estados más Peligrosos

  • Homicidio: 10.81538
  • Agresiones/Asaltos: 257.38462
  • Poblacion Urbana: 76.00000
  • Violaciones: 33.19231

Estados más Seguros:

  • Homicidio: 3.60000
  • Agresiones/Asaltos: 78.53846
  • Poblacion Urbana: 52.07692
  • Violaciones: 12.17692

Haciendo una comparación de estos dos segmentos de estados podemos ver que existen diferencias de 7 casi 8 homicidios más. Más de 150 agresiones y suceden más del doble de violaciones en los estados más peligrosos a comparacion de los más seguros de acuerdo a la base de datos.

LS0tCnRpdGxlOiAiQW7DoWxpc2lzIFNlZ3VyaWRhZCBkZSBFc3RhZG9zIGRlIFVTQSIKYXV0aG9yOiAiR2VuYXJvIFJvZHLDrWd1ZXogQWxjw6FudGFyYSAtIEEwMDgzMzE3MiIKZGF0ZTogIjIwMjQtMDItMTkiCm91dHB1dDogCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFCi0tLQoKIVtdKC9Vc2Vycy9nZW5hcm9yb2RyaWd1ZXphbGNhbnRhcmEvRGVza3RvcC9UZWMvQUkgLSBDb25jZW50cmFjaW/MgW4vTW/MgWR1bG8gMiAtIE1hY2hpbmUgTGVhcm5pbmcvQkQvQU5EUkVXLVRBVEUtQVJSRVNULVZJREVPLmdpZikKCiMgQ29udGV4dG8KTGEgYmFzZSBkZS4gZGF0b3MgIlVTQXJyZXN0cyIgY29udGllbmUgZXN0YWRpc3RpY2FzIGVuIGFycmVzdG9zIHBvciBjYWRhIDEwMCwwMDAgcmVzdWRlbnRlcyBwb3IgYWdyZXNpb24sIGFzZXNpbmF0byB5IHZpb2xhY2lvbiBlbiBjYWRhIHVubyBkZSBsb3MgNTAgZXN0YWRvcyBkZSBFRS5VVSBlbiAxOTczLgoKRnVlbnRlOiBbVVNBIEFycmVzdHMgRGF0YXNldF0oKQoKIyBQYXNvIDEuIEluc3RhbGFyIHBhcXVldGVzIHkgbGxhbWFyIGxpYnJlcmlhcyAKYGBge3J9CiNpbnN0YWxsLnBhY2thZ2VzKCJtYXBzIikKbGlicmFyeShtYXBzKQojaW5zdGFsbC5wYWNrYWdlcygiY2x1c3RlciIpCmxpYnJhcnkoY2x1c3RlcikKI2luc3RhbGwucGFja2FnZXMoImdncGxvdDIiKQpsaWJyYXJ5KGdncGxvdDIpCiNpbnN0YWxsLnBhY2thZ2VzKCJkYXRhLnRhYmxlIikKbGlicmFyeShkYXRhLnRhYmxlKQojaW5zdGFsbC5wYWNrYWdlcygiZmFjdG9leHRyYSIpCmxpYnJhcnkoZmFjdG9leHRyYSkKYGBgCgojIFBhc28gMi4gQmFzZSBkZSBkYXRvcyB5IEFuYWxpemFyIFZhcmlhYmxlcwpgYGB7cn0KZGF0b3MgPC0gVVNBcnJlc3RzCnN1bW1hcnkoZGF0b3MpCmBgYAoKIyBQYXNvIDMuIEVzY2FsYXIKYGBge3J9CmRmIDwtIHNjYWxlKGRhdG9zKQpgYGAKCiMgUGFzbyA0LiBHZW5lcmFyIGdydXBvcwpgYGB7cn0Kc2V0LnNlZWQoMTIzKQpncnVwb3MgPC0gNApzZWdtZW50b3MgPC0ga21lYW5zKGRmLGdydXBvcykKc2VnbWVudG9zCmBgYAoKIyBQYXNvIDUuIEFzaWduYXIgYSBsb3MgZ3J1cG9zCmBgYHtyfQphc2lnbmFjaW9uIDwtIGNiaW5kKGRhdG9zLCBjbHVzdGVyID0gc2VnbWVudG9zJGNsdXN0ZXIpCmFzaWduYWNpb24KYGBgCgpgYGB7cn0KcHJvbWVkaW8gPC0gYWdncmVnYXRlKGFzaWduYWNpb24sIGJ5ID0gbGlzdChhc2lnbmFjaW9uJGNsdXN0ZXIpLCBGVU49bWVhbikKcHJvbWVkaW8KYGBgCgoKYGBge3J9CiNzZWdtZW50b3MkY2x1c3RlcgoKI2VzdGFkb3NjbHVzdGVycyA8LSBkYXRhLmZyYW1lKEVzdGFkb3MgPSByb3duYW1lcyhhc2lnbmFjaW9uKSwgQ2x1c3RlciA9IChzZWdtZW50b3MkY2x1c3RlcikpCiNlc3RhZG9zY2x1c3RlcnMKCmNsdXN0ZXIxID0gYyhyb3duYW1lcyhhc2lnbmFjaW9uW2FzaWduYWNpb24kY2x1c3RlciA9PSAxLF0pKQpjbHVzdGVyMQpjbHVzdGVyMiA9IGMocm93bmFtZXMoYXNpZ25hY2lvblthc2lnbmFjaW9uJGNsdXN0ZXIgPT0gMixdKSkKY2x1c3RlcjIKY2x1c3RlcjMgPSBjKHJvd25hbWVzKGFzaWduYWNpb25bYXNpZ25hY2lvbiRjbHVzdGVyID09IDMsXSkpCmNsdXN0ZXIzCmNsdXN0ZXI0ID0gYyhyb3duYW1lcyhhc2lnbmFjaW9uW2FzaWduYWNpb24kY2x1c3RlciA9PSA0LF0pKQpjbHVzdGVyNApgYGAKCgojIFBhc28gNi4gR3JhZmljYXIgbG9zIGNsdXN0ZXJzCmBgYHtyfQpmdml6X2NsdXN0ZXIoc2VnbWVudG9zLCBkYXRhID0gZGYpCmBgYAoKIyBQYXNvIDcuIEVuY29udHJhciBsYSBjYW50aWRhZCBkZSBncnVwb3Mgw7NwdGltYQpgYGB7cn0Kb3B0aW1pemFjaW9uIDwtIGNsdXNHYXAoZGYsIEZVTj1rbWVhbnMsIG5zdGFydD0xLCBLLm1heD03KQpwbG90KG9wdGltaXphY2lvbiwgeGxhYj0iTsO6bWVybyBkZSBDbHVzdGVycyIpCmBgYAoKIyBQYXNvIDguIENvbG9yZWFyIEVzdGFkb3MgZGUgYWN1ZXJkbyBhIHN1IGNsdXN0ZXIKYGBge3J9CmxpYnJhcnkobWFwcykKCiMgUGxvdHRpbmcgbWFwcyBmb3IgZGlmZmVyZW50IGNsdXN0ZXJzCm1hcChkYXRhYmFzZSA9ICJzdGF0ZSIpCm1hcChkYXRhYmFzZSA9ICJzdGF0ZSIsIHJlZ2lvbnMgPSBjbHVzdGVyMSwgY29sID0gIm9yYW5nZSIsIGZpbGwgPSBUUlVFLCBhZGQgPSBUUlVFKQptYXAoZGF0YWJhc2UgPSAic3RhdGUiLCByZWdpb25zID0gY2x1c3RlcjIsIGNvbCA9ICJncmVlbiIsIGZpbGwgPSBUUlVFLCBhZGQgPSBUUlVFKQptYXAoZGF0YWJhc2UgPSAic3RhdGUiLCByZWdpb25zID0gY2x1c3RlcjMsIGNvbCA9ICJ5ZWxsb3ciLCBmaWxsID0gVFJVRSwgYWRkID0gVFJVRSkKbWFwKGRhdGFiYXNlID0gInN0YXRlIiwgcmVnaW9ucyA9IGNsdXN0ZXI0LCBjb2wgPSAicmVkIiwgZmlsbCA9IFRSVUUsIGFkZCA9IFRSVUUpCgpgYGAKCiMgQ29uY2x1c2nDs24KUG9kZW1vcyB2ZXIgcXVlIHByaW5jaXBhbG1lbnRlIGxvcyBlc3RhZG9zIGRlbCBzdXIgZGVsIHBhaXMgZXN0YWRvdW5pZGVuc2Ugc29uIGxvcyBtw6FzIHBlbGlncm9zb3MgeSBsb3MgZXN0YWRvcyBkZWwgbm9ydGUgc29uIGxvcyBtw6FzIHNlZ3Vyb3MuIFBvZGVtb3MgdmVyIHF1ZSBkZW50cm8gZGUgbG9zIGNsdXN0ZXIgIzEgZXMgZWwgbcOhcyBwZWxpZ3Jvc28gcXVlIHNvbiBsb3MgZXN0YWRvcyBkZSBjb2xvciByb2pvIHRpZW5lbiBlc3RhZGlzdGljYXMgcHJvbWVkaW8gZGU6IAoKKipFc3RhZG9zIG3DoXMgUGVsaWdyb3NvcyoqCgotIEhvbWljaWRpbzogMTAuODE1MzgKLSBBZ3Jlc2lvbmVzL0FzYWx0b3M6IDI1Ny4zODQ2MgotIFBvYmxhY2lvbiBVcmJhbmE6IDc2LjAwMDAwCi0gVmlvbGFjaW9uZXM6IDMzLjE5MjMxCgoqKkVzdGFkb3MgbcOhcyBTZWd1cm9zOioqCgotIEhvbWljaWRpbzogMy42MDAwMAotIEFncmVzaW9uZXMvQXNhbHRvczogNzguNTM4NDYKLSBQb2JsYWNpb24gVXJiYW5hOiA1Mi4wNzY5MgotIFZpb2xhY2lvbmVzOiAxMi4xNzY5MgoKSGFjaWVuZG8gdW5hIGNvbXBhcmFjacOzbiBkZSBlc3RvcyBkb3Mgc2VnbWVudG9zIGRlIGVzdGFkb3MgcG9kZW1vcyB2ZXIgcXVlIGV4aXN0ZW4gZGlmZXJlbmNpYXMgZGUgNyBjYXNpIDggaG9taWNpZGlvcyBtw6FzLiBNw6FzIGRlIDE1MCBhZ3Jlc2lvbmVzIHkgc3VjZWRlbiBtw6FzIGRlbCBkb2JsZSBkZSB2aW9sYWNpb25lcyBlbiBsb3MgZXN0YWRvcyBtw6FzIHBlbGlncm9zb3MgYSBjb21wYXJhY2lvbiBkZSBsb3MgbcOhcyBzZWd1cm9zIGRlIGFjdWVyZG8gYSBsYSBiYXNlIGRlIGRhdG9zLg==