La base de. datos “USArrests” contiene estadisticas en arrestos por cada 100,000 resudentes por agresion, asesinato y violacion en cada uno de los 50 estados de EE.UU en 1973.
Fuente: USA Arrests Dataset
#install.packages("maps")
library(maps)
## Warning: package 'maps' was built under R version 4.3.1
#install.packages("cluster")
library(cluster)
##
## Attaching package: 'cluster'
## The following object is masked from 'package:maps':
##
## votes.repub
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.1
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
datos <- USArrests
summary(datos)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
df <- scale(datos)
set.seed(123)
grupos <- 4
segmentos <- kmeans(df,grupos)
segmentos
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
asignacion <- cbind(datos, cluster = segmentos$cluster)
asignacion
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 4
## Arizona 8.1 294 80 31.0 4
## Arkansas 8.8 190 50 19.5 1
## California 9.0 276 91 40.6 4
## Colorado 7.9 204 78 38.7 4
## Connecticut 3.3 110 77 11.1 3
## Delaware 5.9 238 72 15.8 3
## Florida 15.4 335 80 31.9 4
## Georgia 17.4 211 60 25.8 1
## Hawaii 5.3 46 83 20.2 3
## Idaho 2.6 120 54 14.2 2
## Illinois 10.4 249 83 24.0 4
## Indiana 7.2 113 65 21.0 3
## Iowa 2.2 56 57 11.3 2
## Kansas 6.0 115 66 18.0 3
## Kentucky 9.7 109 52 16.3 2
## Louisiana 15.4 249 66 22.2 1
## Maine 2.1 83 51 7.8 2
## Maryland 11.3 300 67 27.8 4
## Massachusetts 4.4 149 85 16.3 3
## Michigan 12.1 255 74 35.1 4
## Minnesota 2.7 72 66 14.9 2
## Mississippi 16.1 259 44 17.1 1
## Missouri 9.0 178 70 28.2 4
## Montana 6.0 109 53 16.4 2
## Nebraska 4.3 102 62 16.5 2
## Nevada 12.2 252 81 46.0 4
## New Hampshire 2.1 57 56 9.5 2
## New Jersey 7.4 159 89 18.8 3
## New Mexico 11.4 285 70 32.1 4
## New York 11.1 254 86 26.1 4
## North Carolina 13.0 337 45 16.1 1
## North Dakota 0.8 45 44 7.3 2
## Ohio 7.3 120 75 21.4 3
## Oklahoma 6.6 151 68 20.0 3
## Oregon 4.9 159 67 29.3 3
## Pennsylvania 6.3 106 72 14.9 3
## Rhode Island 3.4 174 87 8.3 3
## South Carolina 14.4 279 48 22.5 1
## South Dakota 3.8 86 45 12.8 2
## Tennessee 13.2 188 59 26.9 1
## Texas 12.7 201 80 25.5 4
## Utah 3.2 120 80 22.9 3
## Vermont 2.2 48 32 11.2 2
## Virginia 8.5 156 63 20.7 3
## Washington 4.0 145 73 26.2 3
## West Virginia 5.7 81 39 9.3 2
## Wisconsin 2.6 53 66 10.8 2
## Wyoming 6.8 161 60 15.6 3
promedio <- aggregate(asignacion, by = list(asignacion$cluster), FUN=mean)
promedio
## Group.1 Murder Assault UrbanPop Rape cluster
## 1 1 13.93750 243.62500 53.75000 21.41250 1
## 2 2 3.60000 78.53846 52.07692 12.17692 2
## 3 3 5.65625 138.87500 73.87500 18.78125 3
## 4 4 10.81538 257.38462 76.00000 33.19231 4
#segmentos$cluster
#estadosclusters <- data.frame(Estados = rownames(asignacion), Cluster = (segmentos$cluster))
#estadosclusters
cluster1 = c(rownames(asignacion[asignacion$cluster == 1,]))
cluster1
## [1] "Alabama" "Arkansas" "Georgia" "Louisiana"
## [5] "Mississippi" "North Carolina" "South Carolina" "Tennessee"
cluster2 = c(rownames(asignacion[asignacion$cluster == 2,]))
cluster2
## [1] "Idaho" "Iowa" "Kentucky" "Maine"
## [5] "Minnesota" "Montana" "Nebraska" "New Hampshire"
## [9] "North Dakota" "South Dakota" "Vermont" "West Virginia"
## [13] "Wisconsin"
cluster3 = c(rownames(asignacion[asignacion$cluster == 3,]))
cluster3
## [1] "Connecticut" "Delaware" "Hawaii" "Indiana"
## [5] "Kansas" "Massachusetts" "New Jersey" "Ohio"
## [9] "Oklahoma" "Oregon" "Pennsylvania" "Rhode Island"
## [13] "Utah" "Virginia" "Washington" "Wyoming"
cluster4 = c(rownames(asignacion[asignacion$cluster == 4,]))
cluster4
## [1] "Alaska" "Arizona" "California" "Colorado" "Florida"
## [6] "Illinois" "Maryland" "Michigan" "Missouri" "Nevada"
## [11] "New Mexico" "New York" "Texas"
fviz_cluster(segmentos, data = df)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max=7)
plot(optimizacion, xlab="Número de Clusters")
library(maps)
# Plotting maps for different clusters
map(database = "state")
map(database = "state", regions = cluster1, col = "orange", fill = TRUE, add = TRUE)
map(database = "state", regions = cluster2, col = "green", fill = TRUE, add = TRUE)
map(database = "state", regions = cluster3, col = "yellow", fill = TRUE, add = TRUE)
map(database = "state", regions = cluster4, col = "red", fill = TRUE, add = TRUE)
Podemos ver que principalmente los estados del sur del pais estadounidense son los más peligrosos y los estados del norte son los más seguros. Podemos ver que dentro de los cluster #1 es el más peligroso que son los estados de color rojo tienen estadisticas promedio de:
Estados más Peligrosos
Estados más Seguros:
Haciendo una comparación de estos dos segmentos de estados podemos ver que existen diferencias de 7 casi 8 homicidios más. Más de 150 agresiones y suceden más del doble de violaciones en los estados más peligrosos a comparacion de los más seguros de acuerdo a la base de datos.