.jpg)
La base de datos USArrests contiene estadísticas en arrestos por cada 100,000 residentes por agresión, asesinato y violación en cada uno de los 50 estados de EE.UU. En 1973.
library(cluster)
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(rnaturalearth)
library(rnaturalearthdata)
##
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(devtools)
## Loading required package: usethis
library(caret)
## Loading required package: lattice
library(datasets)
library(ggplot2)
library(lattice)
library(DataExplorer)
df <- USArrests
summary(df)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
# Scale the data
scaled_df <- scale(df)
# Generate the clusters
n_clusters = 4
kmean <- kmeans(scaled_df, centers = n_clusters)
# Assign the labels
labels <- cbind(df, cluster = kmean$cluster)
# Plot the clusters
fviz_cluster(kmean, data = df)
# Paso 7. Optimizar la cantidad de grupos La cantidad optima de grupos
corresponde al punto más alto de la siguiente gráfica
# Optimize the clusters
set.seed(123)
optimal <- clusGap(scaled_df, FUN = kmeans, nstart = 1, K.max = 10)
plot(optimal, xlab = 'Number of clusters')
# El K óptimo es el coeficiente de silueta máximo.
fviz_nbclust(df, kmeans, method = "wss") +
ggtitle("Método del Codo")
# E1 k óptimo es el coeficiente de silueta del punto de inflexión.
# Compare the clusters
cluster_mean <- aggregate(labels, by = list(labels$cluster), FUN = mean)
cluster_mean
## Group.1 Murder Assault UrbanPop Rape cluster
## 1 1 13.93750 243.62500 53.75000 21.41250 1
## 2 2 10.81538 257.38462 76.00000 33.19231 2
## 3 3 3.60000 78.53846 52.07692 12.17692 3
## 4 4 5.65625 138.87500 73.87500 18.78125 4
table(labels$cluster)
##
## 1 2 3 4
## 8 13 13 16
new_cluster_names <- c("Safe State", "High Crime State", "Dangerous State", "Low Crime State")
labels <- labels %>%
mutate(cluster_name = case_when(
cluster == 1 ~ new_cluster_names[1],
cluster == 2 ~ new_cluster_names[2],
cluster == 3 ~ new_cluster_names[3],
cluster == 4 ~ new_cluster_names[4]
))
print(labels)
## Murder Assault UrbanPop Rape cluster cluster_name
## Alabama 13.2 236 58 21.2 1 Safe State
## Alaska 10.0 263 48 44.5 2 High Crime State
## Arizona 8.1 294 80 31.0 2 High Crime State
## Arkansas 8.8 190 50 19.5 1 Safe State
## California 9.0 276 91 40.6 2 High Crime State
## Colorado 7.9 204 78 38.7 2 High Crime State
## Connecticut 3.3 110 77 11.1 4 Low Crime State
## Delaware 5.9 238 72 15.8 4 Low Crime State
## Florida 15.4 335 80 31.9 2 High Crime State
## Georgia 17.4 211 60 25.8 1 Safe State
## Hawaii 5.3 46 83 20.2 4 Low Crime State
## Idaho 2.6 120 54 14.2 3 Dangerous State
## Illinois 10.4 249 83 24.0 2 High Crime State
## Indiana 7.2 113 65 21.0 4 Low Crime State
## Iowa 2.2 56 57 11.3 3 Dangerous State
## Kansas 6.0 115 66 18.0 4 Low Crime State
## Kentucky 9.7 109 52 16.3 3 Dangerous State
## Louisiana 15.4 249 66 22.2 1 Safe State
## Maine 2.1 83 51 7.8 3 Dangerous State
## Maryland 11.3 300 67 27.8 2 High Crime State
## Massachusetts 4.4 149 85 16.3 4 Low Crime State
## Michigan 12.1 255 74 35.1 2 High Crime State
## Minnesota 2.7 72 66 14.9 3 Dangerous State
## Mississippi 16.1 259 44 17.1 1 Safe State
## Missouri 9.0 178 70 28.2 2 High Crime State
## Montana 6.0 109 53 16.4 3 Dangerous State
## Nebraska 4.3 102 62 16.5 3 Dangerous State
## Nevada 12.2 252 81 46.0 2 High Crime State
## New Hampshire 2.1 57 56 9.5 3 Dangerous State
## New Jersey 7.4 159 89 18.8 4 Low Crime State
## New Mexico 11.4 285 70 32.1 2 High Crime State
## New York 11.1 254 86 26.1 2 High Crime State
## North Carolina 13.0 337 45 16.1 1 Safe State
## North Dakota 0.8 45 44 7.3 3 Dangerous State
## Ohio 7.3 120 75 21.4 4 Low Crime State
## Oklahoma 6.6 151 68 20.0 4 Low Crime State
## Oregon 4.9 159 67 29.3 4 Low Crime State
## Pennsylvania 6.3 106 72 14.9 4 Low Crime State
## Rhode Island 3.4 174 87 8.3 4 Low Crime State
## South Carolina 14.4 279 48 22.5 1 Safe State
## South Dakota 3.8 86 45 12.8 3 Dangerous State
## Tennessee 13.2 188 59 26.9 1 Safe State
## Texas 12.7 201 80 25.5 2 High Crime State
## Utah 3.2 120 80 22.9 4 Low Crime State
## Vermont 2.2 48 32 11.2 3 Dangerous State
## Virginia 8.5 156 63 20.7 4 Low Crime State
## Washington 4.0 145 73 26.2 4 Low Crime State
## West Virginia 5.7 81 39 9.3 3 Dangerous State
## Wisconsin 2.6 53 66 10.8 3 Dangerous State
## Wyoming 6.8 161 60 15.6 4 Low Crime State
labels <- labels %>%
mutate(cluster_name = as.factor(cluster_name)) %>%
select(-cluster)
print(labels)
## Murder Assault UrbanPop Rape cluster_name
## Alabama 13.2 236 58 21.2 Safe State
## Alaska 10.0 263 48 44.5 High Crime State
## Arizona 8.1 294 80 31.0 High Crime State
## Arkansas 8.8 190 50 19.5 Safe State
## California 9.0 276 91 40.6 High Crime State
## Colorado 7.9 204 78 38.7 High Crime State
## Connecticut 3.3 110 77 11.1 Low Crime State
## Delaware 5.9 238 72 15.8 Low Crime State
## Florida 15.4 335 80 31.9 High Crime State
## Georgia 17.4 211 60 25.8 Safe State
## Hawaii 5.3 46 83 20.2 Low Crime State
## Idaho 2.6 120 54 14.2 Dangerous State
## Illinois 10.4 249 83 24.0 High Crime State
## Indiana 7.2 113 65 21.0 Low Crime State
## Iowa 2.2 56 57 11.3 Dangerous State
## Kansas 6.0 115 66 18.0 Low Crime State
## Kentucky 9.7 109 52 16.3 Dangerous State
## Louisiana 15.4 249 66 22.2 Safe State
## Maine 2.1 83 51 7.8 Dangerous State
## Maryland 11.3 300 67 27.8 High Crime State
## Massachusetts 4.4 149 85 16.3 Low Crime State
## Michigan 12.1 255 74 35.1 High Crime State
## Minnesota 2.7 72 66 14.9 Dangerous State
## Mississippi 16.1 259 44 17.1 Safe State
## Missouri 9.0 178 70 28.2 High Crime State
## Montana 6.0 109 53 16.4 Dangerous State
## Nebraska 4.3 102 62 16.5 Dangerous State
## Nevada 12.2 252 81 46.0 High Crime State
## New Hampshire 2.1 57 56 9.5 Dangerous State
## New Jersey 7.4 159 89 18.8 Low Crime State
## New Mexico 11.4 285 70 32.1 High Crime State
## New York 11.1 254 86 26.1 High Crime State
## North Carolina 13.0 337 45 16.1 Safe State
## North Dakota 0.8 45 44 7.3 Dangerous State
## Ohio 7.3 120 75 21.4 Low Crime State
## Oklahoma 6.6 151 68 20.0 Low Crime State
## Oregon 4.9 159 67 29.3 Low Crime State
## Pennsylvania 6.3 106 72 14.9 Low Crime State
## Rhode Island 3.4 174 87 8.3 Low Crime State
## South Carolina 14.4 279 48 22.5 Safe State
## South Dakota 3.8 86 45 12.8 Dangerous State
## Tennessee 13.2 188 59 26.9 Safe State
## Texas 12.7 201 80 25.5 High Crime State
## Utah 3.2 120 80 22.9 Low Crime State
## Vermont 2.2 48 32 11.2 Dangerous State
## Virginia 8.5 156 63 20.7 Low Crime State
## Washington 4.0 145 73 26.2 Low Crime State
## West Virginia 5.7 81 39 9.3 Dangerous State
## Wisconsin 2.6 53 66 10.8 Dangerous State
## Wyoming 6.8 161 60 15.6 Low Crime State
# split the data
set.seed(123)
training <- createDataPartition(labels$cluster_name, p = 0.8, list = FALSE)
training_data <- labels[training, ]
test_data <- labels[-training, ]
model <- train(
cluster_name ~ .,
data = training_data,
method = 'rf',
preProcess = c('scale', 'center'),
trControl = trainControl(method = 'cv', number = 10),
tuneGrid = expand.grid(mtry = c(2, 4, 6))
)
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
training_results <- predict(model, training_data)
test_results <- predict(model, test_data)
# Confusion matrix for training data
cmtr <- confusionMatrix(training_results, training_data$cluster_name)
print(cmtr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Dangerous State High Crime State Low Crime State Safe State
## Dangerous State 11 0 0 0
## High Crime State 0 11 0 0
## Low Crime State 0 0 13 0
## Safe State 0 0 0 7
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9159, 1)
## No Information Rate : 0.3095
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Dangerous State Class: High Crime State
## Sensitivity 1.0000 1.0000
## Specificity 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000
## Prevalence 0.2619 0.2619
## Detection Rate 0.2619 0.2619
## Detection Prevalence 0.2619 0.2619
## Balanced Accuracy 1.0000 1.0000
## Class: Low Crime State Class: Safe State
## Sensitivity 1.0000 1.0000
## Specificity 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000
## Prevalence 0.3095 0.1667
## Detection Rate 0.3095 0.1667
## Detection Prevalence 0.3095 0.1667
## Balanced Accuracy 1.0000 1.0000
# Confusion matrix for testing data
cmts <- confusionMatrix(test_results, test_data$cluster_name)
print(cmts)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Dangerous State High Crime State Low Crime State Safe State
## Dangerous State 1 0 0 0
## High Crime State 0 2 0 0
## Low Crime State 1 0 3 0
## Safe State 0 0 0 1
##
## Overall Statistics
##
## Accuracy : 0.875
## 95% CI : (0.4735, 0.9968)
## No Information Rate : 0.375
## P-Value [Acc > NIR] : 0.005605
##
## Kappa : 0.8222
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Dangerous State Class: High Crime State
## Sensitivity 0.5000 1.00
## Specificity 1.0000 1.00
## Pos Pred Value 1.0000 1.00
## Neg Pred Value 0.8571 1.00
## Prevalence 0.2500 0.25
## Detection Rate 0.1250 0.25
## Detection Prevalence 0.1250 0.25
## Balanced Accuracy 0.7500 1.00
## Class: Low Crime State Class: Safe State
## Sensitivity 1.000 1.000
## Specificity 0.800 1.000
## Pos Pred Value 0.750 1.000
## Neg Pred Value 1.000 1.000
## Prevalence 0.375 0.125
## Detection Rate 0.375 0.125
## Detection Prevalence 0.500 0.125
## Balanced Accuracy 0.900 1.000