La base de datos “USArrest” contiene estadisticas en arrestos por cada 100,000 residentes por agresión, asesinato y violacion en cada uno de los 50 estados de EE. UU. en 1973.
#install.packages("cluster")
library(cluster)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.2
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("maps")
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
##
## votes.repub
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- USArrests
summary(df)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
dfscaled <- scale(df)
set.seed(123)
opt <- clusGap(dfscaled, FUN = kmeans, nstart=1, K.max=8)
plot(opt, xlab="Numero de clusters k")
grupo <- 4
seg <- kmeans(df, grupo)
seg
## K-means clustering with 4 clusters of sizes 18, 12, 13, 7
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 4.161111 83.94444 57.77778 13.52778
## 2 12.033333 239.25000 69.16667 28.50000
## 3 6.730769 157.69231 71.23077 21.08462
## 4 11.800000 300.85714 68.71429 28.85714
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 2 2 4 3 4
## Colorado Connecticut Delaware Florida Georgia
## 2 1 2 4 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 1 2 1 1
## Kansas Kentucky Louisiana Maine Maryland
## 1 1 2 1 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 1 2 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 2 4 1 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 1 3 4
## South Dakota Tennessee Texas Utah Vermont
## 1 3 2 3 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 1 1 3
##
## Within cluster sum of squares by cluster:
## [1] 16065.634 8687.183 7955.062 6023.923
## (between_SS / total_SS = 89.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
clusfila <- cbind(df, cluster = seg$cluster)
clusfila
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 2
## Alaska 10.0 263 48 44.5 2
## Arizona 8.1 294 80 31.0 4
## Arkansas 8.8 190 50 19.5 3
## California 9.0 276 91 40.6 4
## Colorado 7.9 204 78 38.7 2
## Connecticut 3.3 110 77 11.1 1
## Delaware 5.9 238 72 15.8 2
## Florida 15.4 335 80 31.9 4
## Georgia 17.4 211 60 25.8 2
## Hawaii 5.3 46 83 20.2 1
## Idaho 2.6 120 54 14.2 1
## Illinois 10.4 249 83 24.0 2
## Indiana 7.2 113 65 21.0 1
## Iowa 2.2 56 57 11.3 1
## Kansas 6.0 115 66 18.0 1
## Kentucky 9.7 109 52 16.3 1
## Louisiana 15.4 249 66 22.2 2
## Maine 2.1 83 51 7.8 1
## Maryland 11.3 300 67 27.8 4
## Massachusetts 4.4 149 85 16.3 3
## Michigan 12.1 255 74 35.1 2
## Minnesota 2.7 72 66 14.9 1
## Mississippi 16.1 259 44 17.1 2
## Missouri 9.0 178 70 28.2 3
## Montana 6.0 109 53 16.4 1
## Nebraska 4.3 102 62 16.5 1
## Nevada 12.2 252 81 46.0 2
## New Hampshire 2.1 57 56 9.5 1
## New Jersey 7.4 159 89 18.8 3
## New Mexico 11.4 285 70 32.1 4
## New York 11.1 254 86 26.1 2
## North Carolina 13.0 337 45 16.1 4
## North Dakota 0.8 45 44 7.3 1
## Ohio 7.3 120 75 21.4 3
## Oklahoma 6.6 151 68 20.0 3
## Oregon 4.9 159 67 29.3 3
## Pennsylvania 6.3 106 72 14.9 1
## Rhode Island 3.4 174 87 8.3 3
## South Carolina 14.4 279 48 22.5 4
## South Dakota 3.8 86 45 12.8 1
## Tennessee 13.2 188 59 26.9 3
## Texas 12.7 201 80 25.5 2
## Utah 3.2 120 80 22.9 3
## Vermont 2.2 48 32 11.2 1
## Virginia 8.5 156 63 20.7 3
## Washington 4.0 145 73 26.2 3
## West Virginia 5.7 81 39 9.3 1
## Wisconsin 2.6 53 66 10.8 1
## Wyoming 6.8 161 60 15.6 3
fviz_cluster(seg, data = df)
clusfila <- as.data.frame(clusfila)
cluster1 <- filter(clusfila, cluster == 1)
cluster2 <- filter(clusfila, cluster == 2)
cluster3 <- filter(clusfila, cluster == 3)
cluster4 <- filter(clusfila, cluster == 4)
mean_assault1 <- mean(cluster1$Assault)
mean_assault2 <- mean(cluster2$Assault)
mean_assault3 <- mean(cluster3$Assault)
mean_assault4 <- mean(cluster4$Assault)
mean_assault4
## [1] 300.8571
mean_assault2
## [1] 239.25
mean_assault3
## [1] 157.6923
mean_assault1
## [1] 83.94444
rojo <- filter(clusfila, cluster == 4)
naranja <- filter(clusfila, cluster == 2)
amarillo <- filter(clusfila, cluster == 3)
verde <- filter(clusfila, cluster == 1)
rojo$State <- rownames(rojo)
naranja$State <- rownames(naranja)
amarillo$State <- rownames(amarillo)
verde$State <- rownames(verde)
map(database = "state")
rojo_mapa <- rojo$State
map(database = "state", regions = rojo_mapa, col = "red", fill = T, add = TRUE)
naranja_mapa <- naranja$State
map(database = "state", regions = naranja_mapa, col = "orange", fill = T, add = TRUE)
amarillo_mapa <- amarillo$State
map(database = "state", regions = amarillo_mapa, col = "yellow", fill = T, add = TRUE)
verde_mapa <- verde$State
map(database = "state", regions = verde_mapa, col = "green", fill = T, add = TRUE)
legend("topright", legend = c("Más Seguro", "Más Inseguro", "Seguro", "Inseguro"),
fill = c("green", "red", "yellow", "orange"))