Contexto

La base de datos USArrests contiene estadísticas en arrestos por cada 100,000 residentes por agresión, asesinato y violación en cada uno de los 50 estados de EE.UU. En 1973.

Instalar paquetesy llamar librerias

library(cluster)
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(rnaturalearth)
library(rnaturalearthdata)
## 
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
library(devtools)
## Loading required package: usethis
library(caret)
## Loading required package: lattice
library(datasets)
library(ggplot2)
library(lattice)
library(DataExplorer)

Importar base de datos

df <- USArrests
summary(df)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

Clusters

# Scale the data
scaled_df <- scale(df)
# Generate the clusters
n_clusters = 4
kmean <- kmeans(scaled_df, centers = n_clusters)
# Assign the labels
labels <- cbind(df, cluster = kmean$cluster)
# Plot the clusters
fviz_cluster(kmean, data = df)

# Paso 7. Optimizar la cantidad de grupos La cantidad optima de grupos corresponde al punto más alto de la siguiente gráfica

# Optimize the clusters
set.seed(123)
optimal <- clusGap(scaled_df, FUN = kmeans, nstart = 1, K.max = 10)
plot(optimal, xlab = 'Number of clusters')

# El K óptimo es el coeficiente de silueta máximo.
fviz_nbclust(df, kmeans, method = "wss") +
  ggtitle("Método del Codo")

# E1 k óptimo es el coeficiente de silueta del punto de inflexión.
# Compare the clusters
cluster_mean <- aggregate(labels, by = list(labels$cluster), FUN = mean)
cluster_mean
##   Group.1   Murder   Assault UrbanPop     Rape cluster
## 1       1 13.93750 243.62500 53.75000 21.41250       1
## 2       2 10.81538 257.38462 76.00000 33.19231       2
## 3       3  3.60000  78.53846 52.07692 12.17692       3
## 4       4  5.65625 138.87500 73.87500 18.78125       4
table(labels$cluster)
## 
##  1  2  3  4 
##  8 13 13 16
new_cluster_names <- c("Safe State", "High Crime State", "Dangerous State", "Low Crime State")

labels <- labels %>%
  mutate(cluster_name = case_when(
    cluster == 1 ~ new_cluster_names[1],
    cluster == 2 ~ new_cluster_names[2],
    cluster == 3 ~ new_cluster_names[3],
    cluster == 4 ~ new_cluster_names[4]
  ))
print(labels)
##                Murder Assault UrbanPop Rape cluster     cluster_name
## Alabama          13.2     236       58 21.2       1       Safe State
## Alaska           10.0     263       48 44.5       2 High Crime State
## Arizona           8.1     294       80 31.0       2 High Crime State
## Arkansas          8.8     190       50 19.5       1       Safe State
## California        9.0     276       91 40.6       2 High Crime State
## Colorado          7.9     204       78 38.7       2 High Crime State
## Connecticut       3.3     110       77 11.1       4  Low Crime State
## Delaware          5.9     238       72 15.8       4  Low Crime State
## Florida          15.4     335       80 31.9       2 High Crime State
## Georgia          17.4     211       60 25.8       1       Safe State
## Hawaii            5.3      46       83 20.2       4  Low Crime State
## Idaho             2.6     120       54 14.2       3  Dangerous State
## Illinois         10.4     249       83 24.0       2 High Crime State
## Indiana           7.2     113       65 21.0       4  Low Crime State
## Iowa              2.2      56       57 11.3       3  Dangerous State
## Kansas            6.0     115       66 18.0       4  Low Crime State
## Kentucky          9.7     109       52 16.3       3  Dangerous State
## Louisiana        15.4     249       66 22.2       1       Safe State
## Maine             2.1      83       51  7.8       3  Dangerous State
## Maryland         11.3     300       67 27.8       2 High Crime State
## Massachusetts     4.4     149       85 16.3       4  Low Crime State
## Michigan         12.1     255       74 35.1       2 High Crime State
## Minnesota         2.7      72       66 14.9       3  Dangerous State
## Mississippi      16.1     259       44 17.1       1       Safe State
## Missouri          9.0     178       70 28.2       2 High Crime State
## Montana           6.0     109       53 16.4       3  Dangerous State
## Nebraska          4.3     102       62 16.5       3  Dangerous State
## Nevada           12.2     252       81 46.0       2 High Crime State
## New Hampshire     2.1      57       56  9.5       3  Dangerous State
## New Jersey        7.4     159       89 18.8       4  Low Crime State
## New Mexico       11.4     285       70 32.1       2 High Crime State
## New York         11.1     254       86 26.1       2 High Crime State
## North Carolina   13.0     337       45 16.1       1       Safe State
## North Dakota      0.8      45       44  7.3       3  Dangerous State
## Ohio              7.3     120       75 21.4       4  Low Crime State
## Oklahoma          6.6     151       68 20.0       4  Low Crime State
## Oregon            4.9     159       67 29.3       4  Low Crime State
## Pennsylvania      6.3     106       72 14.9       4  Low Crime State
## Rhode Island      3.4     174       87  8.3       4  Low Crime State
## South Carolina   14.4     279       48 22.5       1       Safe State
## South Dakota      3.8      86       45 12.8       3  Dangerous State
## Tennessee        13.2     188       59 26.9       1       Safe State
## Texas            12.7     201       80 25.5       2 High Crime State
## Utah              3.2     120       80 22.9       4  Low Crime State
## Vermont           2.2      48       32 11.2       3  Dangerous State
## Virginia          8.5     156       63 20.7       4  Low Crime State
## Washington        4.0     145       73 26.2       4  Low Crime State
## West Virginia     5.7      81       39  9.3       3  Dangerous State
## Wisconsin         2.6      53       66 10.8       3  Dangerous State
## Wyoming           6.8     161       60 15.6       4  Low Crime State

Modelo de Random Forest

labels <- labels %>%
  mutate(cluster_name = as.factor(cluster_name)) %>%
  select(-cluster)
print(labels)
##                Murder Assault UrbanPop Rape     cluster_name
## Alabama          13.2     236       58 21.2       Safe State
## Alaska           10.0     263       48 44.5 High Crime State
## Arizona           8.1     294       80 31.0 High Crime State
## Arkansas          8.8     190       50 19.5       Safe State
## California        9.0     276       91 40.6 High Crime State
## Colorado          7.9     204       78 38.7 High Crime State
## Connecticut       3.3     110       77 11.1  Low Crime State
## Delaware          5.9     238       72 15.8  Low Crime State
## Florida          15.4     335       80 31.9 High Crime State
## Georgia          17.4     211       60 25.8       Safe State
## Hawaii            5.3      46       83 20.2  Low Crime State
## Idaho             2.6     120       54 14.2  Dangerous State
## Illinois         10.4     249       83 24.0 High Crime State
## Indiana           7.2     113       65 21.0  Low Crime State
## Iowa              2.2      56       57 11.3  Dangerous State
## Kansas            6.0     115       66 18.0  Low Crime State
## Kentucky          9.7     109       52 16.3  Dangerous State
## Louisiana        15.4     249       66 22.2       Safe State
## Maine             2.1      83       51  7.8  Dangerous State
## Maryland         11.3     300       67 27.8 High Crime State
## Massachusetts     4.4     149       85 16.3  Low Crime State
## Michigan         12.1     255       74 35.1 High Crime State
## Minnesota         2.7      72       66 14.9  Dangerous State
## Mississippi      16.1     259       44 17.1       Safe State
## Missouri          9.0     178       70 28.2 High Crime State
## Montana           6.0     109       53 16.4  Dangerous State
## Nebraska          4.3     102       62 16.5  Dangerous State
## Nevada           12.2     252       81 46.0 High Crime State
## New Hampshire     2.1      57       56  9.5  Dangerous State
## New Jersey        7.4     159       89 18.8  Low Crime State
## New Mexico       11.4     285       70 32.1 High Crime State
## New York         11.1     254       86 26.1 High Crime State
## North Carolina   13.0     337       45 16.1       Safe State
## North Dakota      0.8      45       44  7.3  Dangerous State
## Ohio              7.3     120       75 21.4  Low Crime State
## Oklahoma          6.6     151       68 20.0  Low Crime State
## Oregon            4.9     159       67 29.3  Low Crime State
## Pennsylvania      6.3     106       72 14.9  Low Crime State
## Rhode Island      3.4     174       87  8.3  Low Crime State
## South Carolina   14.4     279       48 22.5       Safe State
## South Dakota      3.8      86       45 12.8  Dangerous State
## Tennessee        13.2     188       59 26.9       Safe State
## Texas            12.7     201       80 25.5 High Crime State
## Utah              3.2     120       80 22.9  Low Crime State
## Vermont           2.2      48       32 11.2  Dangerous State
## Virginia          8.5     156       63 20.7  Low Crime State
## Washington        4.0     145       73 26.2  Low Crime State
## West Virginia     5.7      81       39  9.3  Dangerous State
## Wisconsin         2.6      53       66 10.8  Dangerous State
## Wyoming           6.8     161       60 15.6  Low Crime State
# split the data 
set.seed(123)
training <- createDataPartition(labels$cluster_name, p = 0.8, list = FALSE)
training_data <- labels[training, ]
test_data <- labels[-training, ]
model <- train(
  cluster_name ~ ., 
  data = training_data,
  method = 'rf',
  preProcess = c('scale', 'center'),
  trControl = trainControl(method = 'cv', number = 10),
  tuneGrid = expand.grid(mtry = c(2, 4, 6))
)
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
training_results <- predict(model, training_data)
test_results <- predict(model, test_data)

# Confusion matrix for training data
cmtr <- confusionMatrix(training_results, training_data$cluster_name)
print(cmtr)
## Confusion Matrix and Statistics
## 
##                   Reference
## Prediction         Dangerous State High Crime State Low Crime State Safe State
##   Dangerous State               11                0               0          0
##   High Crime State               0               11               0          0
##   Low Crime State                0                0              13          0
##   Safe State                     0                0               0          7
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9159, 1)
##     No Information Rate : 0.3095     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: Dangerous State Class: High Crime State
## Sensitivity                          1.0000                  1.0000
## Specificity                          1.0000                  1.0000
## Pos Pred Value                       1.0000                  1.0000
## Neg Pred Value                       1.0000                  1.0000
## Prevalence                           0.2619                  0.2619
## Detection Rate                       0.2619                  0.2619
## Detection Prevalence                 0.2619                  0.2619
## Balanced Accuracy                    1.0000                  1.0000
##                      Class: Low Crime State Class: Safe State
## Sensitivity                          1.0000            1.0000
## Specificity                          1.0000            1.0000
## Pos Pred Value                       1.0000            1.0000
## Neg Pred Value                       1.0000            1.0000
## Prevalence                           0.3095            0.1667
## Detection Rate                       0.3095            0.1667
## Detection Prevalence                 0.3095            0.1667
## Balanced Accuracy                    1.0000            1.0000
# Confusion matrix for testing data
cmts <- confusionMatrix(test_results, test_data$cluster_name)
print(cmts)
## Confusion Matrix and Statistics
## 
##                   Reference
## Prediction         Dangerous State High Crime State Low Crime State Safe State
##   Dangerous State                1                0               0          0
##   High Crime State               0                2               0          0
##   Low Crime State                1                0               3          0
##   Safe State                     0                0               0          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.4735, 0.9968)
##     No Information Rate : 0.375           
##     P-Value [Acc > NIR] : 0.005605        
##                                           
##                   Kappa : 0.8222          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Dangerous State Class: High Crime State
## Sensitivity                          0.5000                    1.00
## Specificity                          1.0000                    1.00
## Pos Pred Value                       1.0000                    1.00
## Neg Pred Value                       0.8571                    1.00
## Prevalence                           0.2500                    0.25
## Detection Rate                       0.1250                    0.25
## Detection Prevalence                 0.1250                    0.25
## Balanced Accuracy                    0.7500                    1.00
##                      Class: Low Crime State Class: Safe State
## Sensitivity                           1.000             1.000
## Specificity                           0.800             1.000
## Pos Pred Value                        0.750             1.000
## Neg Pred Value                        1.000             1.000
## Prevalence                            0.375             0.125
## Detection Rate                        0.375             0.125
## Detection Prevalence                  0.500             0.125
## Balanced Accuracy                     0.900             1.000
LS0tCnRpdGxlOiAiVVNBcnJlc3RzIgphdXRob3I6ICJFcm5lc3RvIEd1ZW5kdWxhaW4gQTAwODM3NjgwIgpkYXRlOiAiMjAyNS0wMi0yMSIKb3V0cHV0OiAKICBodG1sX2RvY3VtZW50OgogICAgICB0b2M6IFRSVUUKICAgICAgdG9jX2Zsb2F0OiBUUlVFCiAgICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUKICAgICAgdGhlbWU6ICdqb3VybmFsJwotLS0KCiFbXShDOlxcVXNlcnNcXEFDRVJcXERvd25sb2Fkc1xcT0lQICgxKS5qcGcpCgojIDxzcGFuIHN0eWxlPSJjb2xvcjogcHVycGxlOyI+Q29udGV4dG88L3NwYW4+CgoKTGEgYmFzZSBkZSBkYXRvcyAqKlVTQXJyZXN0cyoqIGNvbnRpZW5lIGVzdGFkw61zdGljYXMgZW4gYXJyZXN0b3MgcG9yIGNhZGEgMTAwLDAwMCByZXNpZGVudGVzIHBvciBhZ3Jlc2nDs24sIGFzZXNpbmF0byB5IHZpb2xhY2nDs24gZW4gY2FkYSB1bm8gZGUgbG9zIDUwIGVzdGFkb3MgZGUgRUUuVVUuIEVuIDE5NzMuICAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBwdXJwbGU7Ij5JbnN0YWxhciBwYXF1ZXRlc3kgbGxhbWFyIGxpYnJlcmlhczwvc3Bhbj4KCmBgYHtyfQpsaWJyYXJ5KGNsdXN0ZXIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShmYWN0b2V4dHJhKQpsaWJyYXJ5KGRhdGEudGFibGUpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoc2YpCmxpYnJhcnkocm5hdHVyYWxlYXJ0aCkKbGlicmFyeShybmF0dXJhbGVhcnRoZGF0YSkKbGlicmFyeShkZXZ0b29scykKbGlicmFyeShjYXJldCkKbGlicmFyeShkYXRhc2V0cykKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGxhdHRpY2UpCmxpYnJhcnkoRGF0YUV4cGxvcmVyKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBwdXJwbGU7Ij5JbXBvcnRhciBiYXNlIGRlIGRhdG9zPC9zcGFuPgoKYGBge3J9CmRmIDwtIFVTQXJyZXN0cwpzdW1tYXJ5KGRmKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBwdXJwbGU7Ij5DbHVzdGVyczwvc3Bhbj4KCmBgYHtyfQojIFNjYWxlIHRoZSBkYXRhCnNjYWxlZF9kZiA8LSBzY2FsZShkZikKIyBHZW5lcmF0ZSB0aGUgY2x1c3RlcnMKbl9jbHVzdGVycyA9IDQKa21lYW4gPC0ga21lYW5zKHNjYWxlZF9kZiwgY2VudGVycyA9IG5fY2x1c3RlcnMpCiMgQXNzaWduIHRoZSBsYWJlbHMKbGFiZWxzIDwtIGNiaW5kKGRmLCBjbHVzdGVyID0ga21lYW4kY2x1c3RlcikKIyBQbG90IHRoZSBjbHVzdGVycwpmdml6X2NsdXN0ZXIoa21lYW4sIGRhdGEgPSBkZikKYGBgCiMgUGFzbyA3LiBPcHRpbWl6YXIgbGEgY2FudGlkYWQgZGUgZ3J1cG9zCkxhIGNhbnRpZGFkIG9wdGltYSBkZSBncnVwb3MgY29ycmVzcG9uZGUgYWwgcHVudG8gbcOhcyBhbHRvIGRlIGxhIHNpZ3VpZW50ZSBncsOhZmljYSAKYGBge3J9CiMgT3B0aW1pemUgdGhlIGNsdXN0ZXJzCnNldC5zZWVkKDEyMykKb3B0aW1hbCA8LSBjbHVzR2FwKHNjYWxlZF9kZiwgRlVOID0ga21lYW5zLCBuc3RhcnQgPSAxLCBLLm1heCA9IDEwKQpwbG90KG9wdGltYWwsIHhsYWIgPSAnTnVtYmVyIG9mIGNsdXN0ZXJzJykKIyBFbCBLIMOzcHRpbW8gZXMgZWwgY29lZmljaWVudGUgZGUgc2lsdWV0YSBtw6F4aW1vLgpmdml6X25iY2x1c3QoZGYsIGttZWFucywgbWV0aG9kID0gIndzcyIpICsKICBnZ3RpdGxlKCJNw6l0b2RvIGRlbCBDb2RvIikKIyBFMSBrIMOzcHRpbW8gZXMgZWwgY29lZmljaWVudGUgZGUgc2lsdWV0YSBkZWwgcHVudG8gZGUgaW5mbGV4acOzbi4KYGBgCgpgYGB7cn0KIyBDb21wYXJlIHRoZSBjbHVzdGVycwpjbHVzdGVyX21lYW4gPC0gYWdncmVnYXRlKGxhYmVscywgYnkgPSBsaXN0KGxhYmVscyRjbHVzdGVyKSwgRlVOID0gbWVhbikKY2x1c3Rlcl9tZWFuCnRhYmxlKGxhYmVscyRjbHVzdGVyKQpgYGAKCmBgYHtyfQpuZXdfY2x1c3Rlcl9uYW1lcyA8LSBjKCJTYWZlIFN0YXRlIiwgIkhpZ2ggQ3JpbWUgU3RhdGUiLCAiRGFuZ2Vyb3VzIFN0YXRlIiwgIkxvdyBDcmltZSBTdGF0ZSIpCgpsYWJlbHMgPC0gbGFiZWxzICU+JQogIG11dGF0ZShjbHVzdGVyX25hbWUgPSBjYXNlX3doZW4oCiAgICBjbHVzdGVyID09IDEgfiBuZXdfY2x1c3Rlcl9uYW1lc1sxXSwKICAgIGNsdXN0ZXIgPT0gMiB+IG5ld19jbHVzdGVyX25hbWVzWzJdLAogICAgY2x1c3RlciA9PSAzIH4gbmV3X2NsdXN0ZXJfbmFtZXNbM10sCiAgICBjbHVzdGVyID09IDQgfiBuZXdfY2x1c3Rlcl9uYW1lc1s0XQogICkpCnByaW50KGxhYmVscykKCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IHB1cnBsZTsiPk1vZGVsbyBkZSBSYW5kb20gRm9yZXN0PC9zcGFuPgoKYGBge3J9CmxhYmVscyA8LSBsYWJlbHMgJT4lCiAgbXV0YXRlKGNsdXN0ZXJfbmFtZSA9IGFzLmZhY3RvcihjbHVzdGVyX25hbWUpKSAlPiUKICBzZWxlY3QoLWNsdXN0ZXIpCnByaW50KGxhYmVscykKYGBgCgoKYGBge3J9CiMgc3BsaXQgdGhlIGRhdGEgCnNldC5zZWVkKDEyMykKdHJhaW5pbmcgPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihsYWJlbHMkY2x1c3Rlcl9uYW1lLCBwID0gMC44LCBsaXN0ID0gRkFMU0UpCnRyYWluaW5nX2RhdGEgPC0gbGFiZWxzW3RyYWluaW5nLCBdCnRlc3RfZGF0YSA8LSBsYWJlbHNbLXRyYWluaW5nLCBdCmBgYAoKYGBge3J9Cm1vZGVsIDwtIHRyYWluKAogIGNsdXN0ZXJfbmFtZSB+IC4sIAogIGRhdGEgPSB0cmFpbmluZ19kYXRhLAogIG1ldGhvZCA9ICdyZicsCiAgcHJlUHJvY2VzcyA9IGMoJ3NjYWxlJywgJ2NlbnRlcicpLAogIHRyQ29udHJvbCA9IHRyYWluQ29udHJvbChtZXRob2QgPSAnY3YnLCBudW1iZXIgPSAxMCksCiAgdHVuZUdyaWQgPSBleHBhbmQuZ3JpZChtdHJ5ID0gYygyLCA0LCA2KSkKKQoKdHJhaW5pbmdfcmVzdWx0cyA8LSBwcmVkaWN0KG1vZGVsLCB0cmFpbmluZ19kYXRhKQp0ZXN0X3Jlc3VsdHMgPC0gcHJlZGljdChtb2RlbCwgdGVzdF9kYXRhKQoKIyBDb25mdXNpb24gbWF0cml4IGZvciB0cmFpbmluZyBkYXRhCmNtdHIgPC0gY29uZnVzaW9uTWF0cml4KHRyYWluaW5nX3Jlc3VsdHMsIHRyYWluaW5nX2RhdGEkY2x1c3Rlcl9uYW1lKQpwcmludChjbXRyKQoKIyBDb25mdXNpb24gbWF0cml4IGZvciB0ZXN0aW5nIGRhdGEKY210cyA8LSBjb25mdXNpb25NYXRyaXgodGVzdF9yZXN1bHRzLCB0ZXN0X2RhdGEkY2x1c3Rlcl9uYW1lKQpwcmludChjbXRzKQpgYGAKCg==