# Importar librerias
library(tidyverse)
library(stats)
library(dplyr)
library(ggplot2)
library(GGally)
library(corrplot)
library(lattice)
library(pacman)
library(MASS)
library(FactoMineR)
library(gplots)
library(factoextra)
library(ade4)
library(psych)
library(cluster)
library(NbClust)
library(kableExtra)
library(knitr)
library(DT) # data tables
library(reticulate)
options(scipen = 999)
options(digits = 3)
Análisis de Conglomerados (cluster)
Descripción de los datos
Título: Auto mpg (Automóvil mpg)
Enlace: https://archive.ics.uci.edu/dataset/9/auto+mpg
Los datos se refieren al consumo de combustible mpg (millas por galón) de automóviles en ciclo urbano.
Variables:
mpg - Continua: millas recorridas por galón de combustible consumido.
cylinders (Cilindros) - Entero: Número de cilindros del motor.
displacement (Desplazamiento) - Continua: Volumen total de los cilindros del motor. Indica el tamaño del motor. Unidades: pulgadas cúbicas.
horsepower (Caballos de fuerza) - Continua: Medida de la potencia del motor.
weight (Peso) - Continua: Peso del automóvil. Unidades: libras.
acceleration (Aceleración) - Continua: Tiempo que tarda el automóvil en pasar de 0 a 60 millas por hora. Unidades: segundos.
model year (Año del modelo) - Entero: Año de fabricación.
origin (Origen) - Entero: País o región.
car name (Nombre del automóvil) - Texto: Nombre del automóvil, marca y modelo.
Aplicación R
Exploración inicial
# Let the user pick the CSV file interactively, then load it.
# The file is read as semicolon-separated text with a header row.
ruta <- file.choose()
base <- read.csv(ruta, header = TRUE, sep = ";")
# estructura
str(base)
'data.frame': 400 obs. of 9 variables:
$ mpg : chr "18.0" "15.0" "18.0" "16.0" ...
$ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
$ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
$ horsepower : chr "130.0" "165.0" "150.0" "150.0" ...
$ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
$ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
$ modelYear : num 70 70 70 70 70 70 70 70 70 70 ...
$ origin : int 1 1 1 1 1 1 1 1 1 1 ...
$ carName : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
# Convert the columns that were read as character but actually hold numbers.
# as.numeric() yields NA (with a warning) for entries it cannot parse.
base$mpg <- as.numeric(base$mpg)
base$horsepower <- as.numeric(base$horsepower)
# Valores nulos
paste("Total de valores perdidos:", sum(is.na(base)))
[1] "Total de valores perdidos: 40"
# Drop every row that contains at least one missing value.
base <- na.omit(base)

# base2 holds only the numeric analysis variables (no missing values):
# the model year, origin and car-name columns are removed.
base2 <- base
base2$modelYear <- NULL
base2$origin <- NULL
base2$carName <- NULL
# Comprobar estructura y datos
str(base2)
'data.frame': 392 obs. of 6 variables:
$ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
$ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
$ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
$ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
$ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
$ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
- attr(*, "na.action")= 'omit' Named int [1:8] 329 330 336 337 355 356 376 377
..- attr(*, "names")= chr [1:8] "329" "330" "336" "337" ...
head(base2)
mpg cylinders displacement horsepower weight acceleration
1 18 8 307 130 3504 12.0
2 15 8 350 165 3693 11.5
3 18 8 318 150 3436 11.0
4 16 8 304 150 3433 12.0
5 17 8 302 140 3449 10.5
6 15 8 429 198 4341 10.0
tail(base2)
mpg cylinders displacement horsepower weight acceleration
395 27 4 151 90 2950 17.3
396 27 4 140 86 2790 15.6
397 44 4 97 52 2130 24.6
398 32 4 135 84 2295 11.6
399 28 4 120 79 2625 18.6
400 31 4 119 82 2720 19.4
kable(rbind(head(base2),tail(base2))) %>% kable_styling()
mpg | cylinders | displacement | horsepower | weight | acceleration | |
---|---|---|---|---|---|---|
1 | 18 | 8 | 307 | 130 | 3504 | 12.0 |
2 | 15 | 8 | 350 | 165 | 3693 | 11.5 |
3 | 18 | 8 | 318 | 150 | 3436 | 11.0 |
4 | 16 | 8 | 304 | 150 | 3433 | 12.0 |
5 | 17 | 8 | 302 | 140 | 3449 | 10.5 |
6 | 15 | 8 | 429 | 198 | 4341 | 10.0 |
395 | 27 | 4 | 151 | 90 | 2950 | 17.3 |
396 | 27 | 4 | 140 | 86 | 2790 | 15.6 |
397 | 44 | 4 | 97 | 52 | 2130 | 24.6 |
398 | 32 | 4 | 135 | 84 | 2295 | 11.6 |
399 | 28 | 4 | 120 | 79 | 2625 | 18.6 |
400 | 31 | 4 | 119 | 82 | 2720 | 19.4 |
# resumen
summary(base)
mpg cylinders displacement horsepower weight
Min. : 9.0 Min. :3.00 Min. : 68 Min. : 46.0 Min. :1613
1st Qu.:17.0 1st Qu.:4.00 1st Qu.:105 1st Qu.: 75.0 1st Qu.:2225
Median :22.8 Median :4.00 Median :151 Median : 93.5 Median :2804
Mean :23.4 Mean :5.47 Mean :194 Mean :104.5 Mean :2978
3rd Qu.:29.0 3rd Qu.:8.00 3rd Qu.:276 3rd Qu.:126.0 3rd Qu.:3615
Max. :46.6 Max. :8.00 Max. :455 Max. :230.0 Max. :5140
acceleration modelYear origin carName
Min. : 8.0 Min. :70 Min. :1.00 Length:392
1st Qu.:13.8 1st Qu.:73 1st Qu.:1.00 Class :character
Median :15.5 Median :76 Median :1.00 Mode :character
Mean :15.5 Mean :76 Mean :1.58
3rd Qu.:17.0 3rd Qu.:79 3rd Qu.:2.00
Max. :24.8 Max. :82 Max. :3.00
kable(describe(base2)) %>% kable_styling()
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mpg | 1 | 392 | 23.45 | 7.80 | 22.8 | 22.99 | 8.60 | 9 | 46.6 | 37.6 | 0.454 | -0.537 | 0.394 |
cylinders | 2 | 392 | 5.47 | 1.71 | 4.0 | 5.35 | 0.00 | 3 | 8.0 | 5.0 | 0.504 | -1.404 | 0.086 |
displacement | 3 | 392 | 194.41 | 104.64 | 151.0 | 183.83 | 90.44 | 68 | 455.0 | 387.0 | 0.696 | -0.795 | 5.285 |
horsepower | 4 | 392 | 104.47 | 38.49 | 93.5 | 99.82 | 28.91 | 46 | 230.0 | 184.0 | 1.079 | 0.654 | 1.944 |
weight | 5 | 392 | 2977.58 | 849.40 | 2803.5 | 2916.94 | 948.12 | 1613 | 5140.0 | 3527.0 | 0.516 | -0.825 | 42.901 |
acceleration | 6 | 392 | 15.54 | 2.76 | 15.5 | 15.48 | 2.52 | 8 | 24.8 | 16.8 | 0.289 | 0.406 | 0.139 |
#correlaciones
cor(base2)
mpg cylinders displacement horsepower weight acceleration
mpg 1.000 -0.778 -0.805 -0.778 -0.832 0.423
cylinders -0.778 1.000 0.951 0.843 0.898 -0.505
displacement -0.805 0.951 1.000 0.897 0.933 -0.544
horsepower -0.778 0.843 0.897 1.000 0.865 -0.689
weight -0.832 0.898 0.933 0.865 1.000 -0.417
acceleration 0.423 -0.505 -0.544 -0.689 -0.417 1.000
kable(cor(base2)) %>% kable_styling()
mpg | cylinders | displacement | horsepower | weight | acceleration | |
---|---|---|---|---|---|---|
mpg | 1.000 | -0.778 | -0.805 | -0.778 | -0.832 | 0.423 |
cylinders | -0.778 | 1.000 | 0.951 | 0.843 | 0.898 | -0.505 |
displacement | -0.805 | 0.951 | 1.000 | 0.897 | 0.933 | -0.544 |
horsepower | -0.778 | 0.843 | 0.897 | 1.000 | 0.865 | -0.689 |
weight | -0.832 | 0.898 | 0.933 | 0.865 | 1.000 | -0.417 |
acceleration | 0.423 | -0.505 | -0.544 | -0.689 | -0.417 | 1.000 |
# mapa de calor correlacones
corrplot(cor(base2),method = "color",type = "upper",tl.cex = 0.8,tl.col = "black",tl.srt = 45,addCoef.col = "black",number.cex = 0.7,mar = c(1, 1, 2, 1), title = "Mapa de calor - Matriz de correlaciones")
# Histogram of every variable in base2, one plot per column.
# A single loop replaces the six copy-pasted calls; the y-axis label
# spelling is corrected ("Frecuencia").
for (variable in names(base2)) {
  hist(
    base2[[variable]],
    col = "skyblue4",
    main = paste0("Histograma de '", variable, "'"),
    xlab = variable,
    ylab = "Frecuencia"
  )
}
Datos atípicos
Detección de datos atípicos
# Squared Mahalanobis distance of each observation to the vector of column
# means, using the sample covariance matrix of base2.
mahalanobis_dist <- mahalanobis(base2, colMeans(base2), cov(base2))
# Critical value used to flag outliers (chi-squared quantile)
gl <- ncol(base2) # degrees of freedom (number of variables)
alpha <- 0.05 # significance level
valor_critico <- qchisq(1 - alpha, df = gl)

valor_critico # show the critical value
[1] 12.6
# Positions of the observations whose distance exceeds the critical value
atipicos <- which(mahalanobis_dist > valor_critico)
head(mahalanobis_dist) #ver distancias
1 2 3 4 5 6
6.05 5.49 6.23 6.59 7.42 12.62
length(atipicos)
[1] 38
Estandarizar los datos
# Center and scale every column of base2, keeping the result as a data frame.
base2scale <- as.data.frame(scale(base2))
kable(rbind(head(base2scale),tail(base2scale))) %>% kable_styling()
mpg | cylinders | displacement | horsepower | weight | acceleration | |
---|---|---|---|---|---|---|
1 | -0.698 | 1.482 | 1.076 | 0.663 | 0.620 | -1.284 |
2 | -1.082 | 1.482 | 1.487 | 1.573 | 0.842 | -1.465 |
3 | -0.698 | 1.482 | 1.181 | 1.183 | 0.540 | -1.646 |
4 | -0.954 | 1.482 | 1.047 | 1.183 | 0.536 | -1.284 |
5 | -0.826 | 1.482 | 1.028 | 0.923 | 0.555 | -1.827 |
6 | -1.082 | 1.482 | 2.242 | 2.430 | 1.605 | -2.009 |
395 | 0.455 | -0.863 | -0.415 | -0.376 | -0.032 | 0.637 |
396 | 0.455 | -0.863 | -0.520 | -0.480 | -0.221 | 0.021 |
397 | 2.633 | -0.863 | -0.931 | -1.363 | -0.998 | 3.283 |
398 | 1.096 | -0.863 | -0.568 | -0.532 | -0.804 | -1.429 |
399 | 0.583 | -0.863 | -0.711 | -0.662 | -0.415 | 1.109 |
400 | 0.968 | -0.863 | -0.721 | -0.584 | -0.303 | 1.399 |
# resumen
summary(base2scale)
mpg cylinders displacement horsepower
Min. :-1.851 Min. :-1.449 Min. :-1.208 Min. :-1.52
1st Qu.:-0.826 1st Qu.:-0.863 1st Qu.:-0.854 1st Qu.:-0.77
Median :-0.089 Median :-0.863 Median :-0.415 Median :-0.28
Mean : 0.000 Mean : 0.000 Mean : 0.000 Mean : 0.00
3rd Qu.: 0.712 3rd Qu.: 1.482 3rd Qu.: 0.777 3rd Qu.: 0.56
Max. : 2.967 Max. : 1.482 Max. : 2.490 Max. : 3.26
weight acceleration
Min. :-1.607 Min. :-2.73
1st Qu.:-0.886 1st Qu.:-0.64
Median :-0.205 Median :-0.01
Mean : 0.000 Mean : 0.00
3rd Qu.: 0.750 3rd Qu.: 0.54
Max. : 2.546 Max. : 3.36
kable(describe(base2scale)) %>% kable_styling()
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mpg | 1 | 392 | 0 | 1 | -0.089 | -0.058 | 1.102 | -1.85 | 2.97 | 4.82 | 0.454 | -0.537 | 0.051 |
cylinders | 2 | 392 | 0 | 1 | -0.863 | -0.069 | 0.000 | -1.45 | 1.48 | 2.93 | 0.504 | -1.404 | 0.051 |
displacement | 3 | 392 | 0 | 1 | -0.415 | -0.101 | 0.864 | -1.21 | 2.49 | 3.70 | 0.696 | -0.795 | 0.051 |
horsepower | 4 | 392 | 0 | 1 | -0.285 | -0.121 | 0.751 | -1.52 | 3.26 | 4.78 | 1.079 | 0.654 | 0.051 |
weight | 5 | 392 | 0 | 1 | -0.205 | -0.071 | 1.116 | -1.61 | 2.55 | 4.15 | 0.516 | -0.825 | 0.051 |
acceleration | 6 | 392 | 0 | 1 | -0.015 | -0.023 | 0.914 | -2.73 | 3.36 | 6.09 | 0.289 | 0.406 | 0.051 |
Distancias euclídeas
# Euclidean distances between all standardized observations
distancias <- dist(base2scale, method = "euclidean")
# Same distances restricted to the first 20 observations (used for display)
distancias20 <- dist(base2scale[1:20, ], method = "euclidean")
# matriz con las distancais de la primeras 20 observaciones
kable(as.matrix(distancias20)) %>% kable_styling()
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0.000 | 1.107 | 0.647 | 0.586 | 0.621 | 2.475 | 3.144 | 3.044 | 3.171 | 2.232 | 1.512 | 1.757 | 1.456 | 3.02 | 3.649 | 2.405 | 2.350 | 2.684 | 3.935 | 5.36 |
1.107 | 0.000 | 0.719 | 0.699 | 0.955 | 1.477 | 2.115 | 2.039 | 2.119 | 1.336 | 0.660 | 1.289 | 0.955 | 2.06 | 4.392 | 3.275 | 3.162 | 3.563 | 4.720 | 6.19 |
0.647 | 0.719 | 0.000 | 0.464 | 0.375 | 2.024 | 2.637 | 2.522 | 2.695 | 1.661 | 0.978 | 1.264 | 1.097 | 2.46 | 3.955 | 2.856 | 2.799 | 3.151 | 4.226 | 5.81 |
0.586 | 0.699 | 0.464 | 0.000 | 0.617 | 2.160 | 2.790 | 2.699 | 2.799 | 1.903 | 1.185 | 1.548 | 1.352 | 2.58 | 3.830 | 2.672 | 2.560 | 2.953 | 4.143 | 5.60 |
0.621 | 0.955 | 0.375 | 0.617 | 0.000 | 2.224 | 2.830 | 2.691 | 2.918 | 1.791 | 1.150 | 1.186 | 1.130 | 2.72 | 3.899 | 2.820 | 2.743 | 3.104 | 4.161 | 5.78 |
2.475 | 1.477 | 2.024 | 2.160 | 2.224 | 0.000 | 0.729 | 0.721 | 0.762 | 0.901 | 1.250 | 1.727 | 1.460 | 1.66 | 5.636 | 4.619 | 4.541 | 4.936 | 5.961 | 7.51 |
3.144 | 2.115 | 2.637 | 2.790 | 2.830 | 0.729 | 0.000 | 0.265 | 0.394 | 1.176 | 1.778 | 2.125 | 2.027 | 1.54 | 6.203 | 5.264 | 5.177 | 5.580 | 6.519 | 8.13 |
3.044 | 2.039 | 2.522 | 2.699 | 2.691 | 0.721 | 0.265 | 0.000 | 0.634 | 0.981 | 1.659 | 1.916 | 1.888 | 1.57 | 6.115 | 5.193 | 5.104 | 5.510 | 6.421 | 8.08 |
3.171 | 2.119 | 2.695 | 2.799 | 2.918 | 0.762 | 0.394 | 0.634 | 0.000 | 1.408 | 1.887 | 2.347 | 2.176 | 1.58 | 6.193 | 5.230 | 5.141 | 5.546 | 6.527 | 8.06 |
2.232 | 1.336 | 1.661 | 1.903 | 1.791 | 0.901 | 1.176 | 0.981 | 1.408 | 0.000 | 0.827 | 0.983 | 1.110 | 1.53 | 5.339 | 4.417 | 4.328 | 4.723 | 5.617 | 7.34 |
1.512 | 0.660 | 0.978 | 1.185 | 1.150 | 1.250 | 1.778 | 1.659 | 1.887 | 0.827 | 0.000 | 0.884 | 0.619 | 1.69 | 4.747 | 3.717 | 3.612 | 3.994 | 5.038 | 6.63 |
1.757 | 1.289 | 1.264 | 1.548 | 1.186 | 1.727 | 2.125 | 1.916 | 2.347 | 0.983 | 0.884 | 0.000 | 0.860 | 2.23 | 4.822 | 3.917 | 3.806 | 4.198 | 5.074 | 6.85 |
1.456 | 0.955 | 1.097 | 1.352 | 1.130 | 1.460 | 2.027 | 1.888 | 2.176 | 1.110 | 0.619 | 0.860 | 0.000 | 2.18 | 4.800 | 3.724 | 3.632 | 3.991 | 5.081 | 6.65 |
3.020 | 2.064 | 2.465 | 2.576 | 2.716 | 1.659 | 1.542 | 1.571 | 1.576 | 1.528 | 1.687 | 2.228 | 2.180 | 0.00 | 5.764 | 4.892 | 4.774 | 5.140 | 6.048 | 7.61 |
3.649 | 4.392 | 3.955 | 3.830 | 3.899 | 5.636 | 6.203 | 6.115 | 6.193 | 5.339 | 4.747 | 4.822 | 4.800 | 5.76 | 0.000 | 1.558 | 1.703 | 1.574 | 0.564 | 2.47 |
2.405 | 3.275 | 2.856 | 2.672 | 2.820 | 4.619 | 5.264 | 5.193 | 5.230 | 4.417 | 3.717 | 3.917 | 3.724 | 4.89 | 1.558 | 0.000 | 0.520 | 0.448 | 1.888 | 2.98 |
2.350 | 3.162 | 2.799 | 2.560 | 2.743 | 4.541 | 5.177 | 5.104 | 5.141 | 4.328 | 3.612 | 3.806 | 3.632 | 4.77 | 1.703 | 0.520 | 0.000 | 0.571 | 2.101 | 3.10 |
2.684 | 3.563 | 3.151 | 2.953 | 3.104 | 4.936 | 5.580 | 5.510 | 5.546 | 4.723 | 3.994 | 4.198 | 3.991 | 5.14 | 1.574 | 0.448 | 0.571 | 0.000 | 1.878 | 2.69 |
3.935 | 4.720 | 4.226 | 4.143 | 4.161 | 5.961 | 6.519 | 6.421 | 6.527 | 5.617 | 5.038 | 5.074 | 5.081 | 6.05 | 0.564 | 1.888 | 2.101 | 1.878 | 0.000 | 2.46 |
5.357 | 6.190 | 5.807 | 5.602 | 5.777 | 7.506 | 8.131 | 8.081 | 8.061 | 7.339 | 6.630 | 6.850 | 6.649 | 7.61 | 2.467 | 2.976 | 3.104 | 2.688 | 2.461 | 0.00 |
fviz_dist(distancias20)
Clúster jerárquico - Método de Ward
set.seed(252)
# Hierarchical clustering on the Euclidean distances with Ward's criterion
# (the "ward.D2" variant of hclust).
clusterW <- hclust(distancias, method = "ward.D2")
Dendrogramas
# Heat map (with dendrograms) of the distance matrix
# for the first 20 observations
heatmap(
  as.matrix(distancias20),
  main = "Mapa de Calor - Distancias euclídeas",
  xlab = "Observaciones",
  ylab = "Observaciones"
)
# Plain dendrogram of the Ward tree
fviz_dend(
  clusterW,
  cex = 0.6,
  rect = FALSE,
  labels_track_height = 5.5
) +
  labs(title = "Dendograma", subtitle = "Método de Ward.")

# Dendrogram cut into 4 clusters, with a dashed reference line at height 12
fviz_dend(
  clusterW,
  cex = 0.6,
  k = 4,
  k_colors = "black",
  labels_track_height = 5.5,
  rect = TRUE,
  rect_border = "jco",
  rect_fill = TRUE
) +
  labs(title = "Dendograma", subtitle = "Método de Ward.") +
  geom_hline(yintercept = 12, linetype = "dashed", color = "red")

# Circular (radial) layout of the same tree
fviz_dend(
  clusterW,
  cex = 0.5,
  type = "circular"
) +
  labs(title = "Dendograma circular", subtitle = "Método de Ward.")
Número óptimo de grupos
# Average silhouette width per number of clusters, using hcut as the
# clustering function. The plot title typo ("Métodod") is corrected.
fviz_nbclust(base2scale, FUN = hcut, method = "silhouette") +
  labs(title = "Número óptimo de clústeres (Silueta - Método de Ward)")
# Run the NbClust set of indices for 2 to 6 clusters
# (Euclidean distance, "ward.D2" linkage).
Ncluster <- NbClust(
  base2scale,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 6,
  method = "ward.D2"
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 8 proposed 2 as the best number of clusters
* 13 proposed 3 as the best number of clusters
* 1 proposed 6 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
# Number of clusters suggested by each index
Ncluster$Best.nc
KL CH Hartigan CCC Scott Marriot TrCovW TraceW
Number_clusters 3.00 2 3 3.00 3 3 3 3
Value_Index 4.99 580 145 2.98 967 579838673324 34659 243
Friedman Rubin Cindex DB Silhouette Duda PseudoT2 Beale
Number_clusters 3.0 3.000 6.000 2.000 2.000 NA NA 3.0
Value_Index 46.2 -0.665 0.278 0.657 0.544 NA NA 1.4
Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert SDindex
Number_clusters 2.000 3 2.000 3.00 2.000 2.000 0 2.00
Value_Index 0.542 270 0.737 2.59 0.272 0.168 0 1.09
Dindex SDbw
Number_clusters 0 3.000
Value_Index 0 0.321
kable(as.data.frame(Ncluster$Best.nc)) %>% kable_styling()
KL | CH | Hartigan | CCC | Scott | Marriot | TrCovW | TraceW | Friedman | Rubin | Cindex | DB | Silhouette | Duda | PseudoT2 | Beale | Ratkowsky | Ball | PtBiserial | Frey | McClain | Dunn | Hubert | SDindex | Dindex | SDbw | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Number_clusters | 3.00 | 2 | 3 | 3.00 | 3 | 3 | 3 | 3 | 3.0 | 3.000 | 6.000 | 2.000 | 2.000 | NA | NA | 3.0 | 2.000 | 3 | 2.000 | 3.0 | 2.000 | 2.000 | 0 | 2.00 | 0 | 3.000 |
Value_Index | 4.99 | 580 | 145 | 2.98 | 967 | 579838673324 | 34659 | 243 | 46.2 | -0.665 | 0.278 | 0.657 | 0.544 | NA | NA | 1.4 | 0.542 | 270 | 0.737 | 2.6 | 0.272 | 0.168 | 0 | 1.09 | 0 | 0.321 |
Identificar observaciones en grupos(clusters)
# Store the cluster assignment in a new data frame
base3 <- base2

# Add the columns
base3$car <- base$carName # car name, to identify each observation
base3$cluster <- as.factor(cutree(clusterW, k = 3)) # 3-group cut of the tree
# número de grupos
levels(base3$cluster)
[1] "1" "2" "3"
# Visualizar (priemras 10 y últimas 10 observaciones)
kable(rbind(head(base3,10),tail(base3,10))) %>% kable_styling()
mpg | cylinders | displacement | horsepower | weight | acceleration | car | cluster | |
---|---|---|---|---|---|---|---|---|
1 | 18 | 8 | 307 | 130 | 3504 | 12.0 | chevrolet chevelle malibu | 1 |
2 | 15 | 8 | 350 | 165 | 3693 | 11.5 | buick skylark 320 | 1 |
3 | 18 | 8 | 318 | 150 | 3436 | 11.0 | plymouth satellite | 1 |
4 | 16 | 8 | 304 | 150 | 3433 | 12.0 | amc rebel sst | 1 |
5 | 17 | 8 | 302 | 140 | 3449 | 10.5 | ford torino | 1 |
6 | 15 | 8 | 429 | 198 | 4341 | 10.0 | ford galaxie 500 | 1 |
7 | 14 | 8 | 454 | 220 | 4354 | 9.0 | chevrolet impala | 1 |
8 | 14 | 8 | 440 | 215 | 4312 | 8.5 | plymouth fury iii | 1 |
9 | 14 | 8 | 455 | 225 | 4425 | 10.0 | pontiac catalina | 1 |
10 | 15 | 8 | 390 | 190 | 3850 | 8.5 | amc ambassador dpl | 1 |
391 | 26 | 4 | 156 | 92 | 2585 | 14.5 | chrysler lebaron medallion | 2 |
392 | 22 | 6 | 232 | 112 | 2835 | 14.7 | ford granada l | 3 |
393 | 32 | 4 | 144 | 96 | 2665 | 13.9 | toyota celica gt | 2 |
394 | 36 | 4 | 135 | 84 | 2370 | 13.0 | dodge charger 2.2 | 2 |
395 | 27 | 4 | 151 | 90 | 2950 | 17.3 | chevrolet camaro | 2 |
396 | 27 | 4 | 140 | 86 | 2790 | 15.6 | ford mustang gl | 2 |
397 | 44 | 4 | 97 | 52 | 2130 | 24.6 | vw pickup | 2 |
398 | 32 | 4 | 135 | 84 | 2295 | 11.6 | dodge rampage | 2 |
399 | 28 | 4 | 120 | 79 | 2625 | 18.6 | ford ranger | 2 |
400 | 31 | 4 | 119 | 82 | 2720 | 19.4 | chevy s-10 | 2 |
# Table with each cluster and its number of observations
obs_cluster <- as.data.frame(table(cutree(clusterW, k = 3)))
colnames(obs_cluster) <- c("Cluster", "N observaciones")
# Mostrar tabla
kable(obs_cluster, caption = "Observaciones por grupo") %>% kable_styling()
Cluster | N observaciones |
---|---|
1 | 99 |
2 | 208 |
3 | 85 |
PCA y cluster jerárquico
Aplicar PCA
# Principal component analysis on the standardized data, keeping 6 axes.
# scannf = FALSE skips the interactive scree-plot prompt; scale = TRUE norms
# the columns, i.e. the analysis is based on the correlation matrix.
acp <- dudi.pca(base2scale, scannf = FALSE, scale = TRUE, nf = 6)
# Eigenvalues
acp$eig
[1] 4.7883 0.7286 0.2585 0.1252 0.0632 0.0363
kable(as.data.frame(t(acp$eig)), col.names = paste0("CP", 1:length(acp$eig))) %>% kable_styling()
CP1 | CP2 | CP3 | CP4 | CP5 | CP6 |
---|---|---|---|---|---|
4.79 | 0.729 | 0.258 | 0.125 | 0.063 | 0.036 |
# Resumen de componenetes y varianzas acumuladas
summary(acp)
Class: pca dudi
Call: dudi.pca(df = base2scale, scale = TRUE, scannf = FALSE, nf = 6)
Total inertia: 6
Eigenvalues:
Ax1 Ax2 Ax3 Ax4 Ax5
4.78827 0.72863 0.25847 0.12518 0.06318
Projected inertia (%):
Ax1 Ax2 Ax3 Ax4 Ax5
79.804 12.144 4.308 2.086 1.053
Cumulative projected inertia (%):
Ax1 Ax1:2 Ax1:3 Ax1:4 Ax1:5
79.80 91.95 96.26 98.34 99.40
(Only 5 dimensions (out of 6) are shown)
# Eigenvectors
acp$c1
CS1 CS2 CS3 CS4 CS5 CS6
mpg 0.399 0.245 0.8521 -0.221 -0.07109 -0.0312
cylinders -0.431 -0.148 0.4003 0.576 0.28590 -0.4674
displacement -0.444 -0.108 0.2975 0.111 -0.00559 0.8311
horsepower -0.434 0.166 0.0226 -0.675 0.55859 -0.1243
weight -0.430 -0.286 0.1247 -0.341 -0.72661 -0.2710
acceleration 0.292 -0.893 0.0953 -0.186 0.27053 0.0318
kable(acp$c1) %>% kable_styling()
CS1 | CS2 | CS3 | CS4 | CS5 | CS6 | |
---|---|---|---|---|---|---|
mpg | 0.399 | 0.245 | 0.852 | -0.221 | -0.071 | -0.031 |
cylinders | -0.431 | -0.148 | 0.400 | 0.576 | 0.286 | -0.467 |
displacement | -0.444 | -0.108 | 0.298 | 0.111 | -0.006 | 0.831 |
horsepower | -0.434 | 0.166 | 0.023 | -0.675 | 0.559 | -0.124 |
weight | -0.430 | -0.286 | 0.125 | -0.341 | -0.727 | -0.271 |
acceleration | 0.292 | -0.893 | 0.095 | -0.186 | 0.271 | 0.032 |
# Scree plots
# Base-graphics version: eigenvalues by component, with a dotted
# horizontal reference line at 1.
plot(
  acp$eig,
  type = "b",
  pch = 20,
  col = "royalblue4",
  xlab = "Número de componentes",
  ylab = "Varianza explicada"
)
title("Gráfico de sedimentación")
abline(h = 1, lty = 3, col = "red3")

# factoextra version
fviz_eig(
  acp,
  addlabels = TRUE,
  hjust = -0.3,
  barfill = "white",
  barcolor = "royalblue4",
  linecolor = "red3"
) +
  ylim(0, 80) +
  labs(
    title = "Gráfico de sedimentación",
    subtitle = "Autovalores - matriz de correlaciones",
    x = "Número de componentes",
    y = "Varianza explicada"
  ) +
  theme_minimal()
# Element-wise square of the variable coordinates stored in acp$co,
# one column per component.
contrib <- acp$co^2
contrib
Comp1 Comp2 Comp3 Comp4 Comp5 Comp6
mpg 0.762 0.04368 0.187671 0.00610 0.00031932 0.0000353
cylinders 0.888 0.01603 0.041421 0.04158 0.00516411 0.0079255
displacement 0.942 0.00858 0.022877 0.00154 0.00000198 0.0250598
horsepower 0.902 0.02012 0.000132 0.05707 0.01971234 0.0005607
weight 0.886 0.05964 0.004019 0.01455 0.03335492 0.0026641
acceleration 0.408 0.58059 0.002346 0.00434 0.00462375 0.0000367
# Convert the data frame to a plain numeric matrix
contrib <- as.matrix(contrib)
corrplot(contrib, is.corr = FALSE, col = colorRampPalette(c("white", "purple4"))(200))
kable(round(contrib,2)) %>% kable_styling()
Comp1 | Comp2 | Comp3 | Comp4 | Comp5 | Comp6 | |
---|---|---|---|---|---|---|
mpg | 0.76 | 0.04 | 0.19 | 0.01 | 0.00 | 0.00 |
cylinders | 0.89 | 0.02 | 0.04 | 0.04 | 0.01 | 0.01 |
displacement | 0.94 | 0.01 | 0.02 | 0.00 | 0.00 | 0.03 |
horsepower | 0.90 | 0.02 | 0.00 | 0.06 | 0.02 | 0.00 |
weight | 0.89 | 0.06 | 0.00 | 0.01 | 0.03 | 0.00 |
acceleration | 0.41 | 0.58 | 0.00 | 0.00 | 0.00 | 0.00 |
# Gráfico de puntuaciones y cargas
fviz_pca_biplot(acp, repel = FALSE, axes=c(1,2), col.var = "gray2", col.ind ="darkorange2")+theme_minimal() +
ggtitle("Puntuaciones (scores) en componenetes 1 y 2")
Gráficos cluster y PCA
# Observations drawn by cluster (3-group cut of the Ward tree).
# TRUE/FALSE are spelled out: F is an ordinary variable that can be reassigned.
fviz_cluster(
  list(data = base2, cluster = cutree(clusterW, k = 3)),
  palette = c("skyblue3", "firebrick", "purple4"),
  ellipse.type = "norm",
  repel = FALSE,
  show.clust.cent = FALSE,
  ggtheme = theme_minimal()
) +
  labs(title = "Clusters y obervaciones")
# PCA biplot: observations colored by cluster, variables drawn in black
fviz_pca_biplot(
  acp,
  label = "var",
  habillage = as.factor(cutree(clusterW, k = 3)),
  palette = c("skyblue3", "firebrick", "purple4"),
  col.var = "black"
) +
  labs(color = NULL) +
  ggtitle("Clusters, obervaciones y variables") +
  theme(
    text = element_text(size = 15),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    legend.key = element_rect(fill = "white")
  )
# All observations (every original column) together with their cluster
originalesYcluster <- base
originalesYcluster$cluster <- as.factor(cutree(clusterW, k = 3))
# tabla interactiva
datatable(originalesYcluster,
options = list(pageLength = 10,
autoWidth = TRUE),
caption = "Observaciones y grupo(cluster) asignado")
Aplicación Python
import numpy as np
import pandas as pd
import csv
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from scipy.spatial.distance import pdist, squareform
import pingouin as pg
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2
from sklearn.cluster import KMeans
import seaborn as sns
Exploración inicial
# Load the comma-separated CSV file into a DataFrame.
base = pd.read_csv(r"C:\Users\MINEDUCYT\Documents\Seminario\auto+mpg\auto_mpg_v2.csv", sep=",")
pd.set_option('display.max_rows', None)  # show every row
pd.set_option('display.max_columns', None)  # show every column
# información de las variables
base.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mpg 392 non-null float64
1 cylinders 392 non-null int64
2 displacement 392 non-null float64
3 horsepower 392 non-null int64
4 weight 392 non-null int64
5 acceleration 392 non-null float64
6 modelYear 392 non-null int64
7 origin 392 non-null int64
8 carName 392 non-null object
dtypes: float64(3), int64(5), object(1)
memory usage: 27.7+ KB
# Valores nulos:
# Se creó un nuevo archivo csv a partir del data frame "base" en R para importar a python. El cual no contiene valores nulos.
# New data frame without the model year, origin and car-name columns.
# DataFrame.drop returns a new object, so `base` itself is left untouched.
base2 = base.drop(columns=["modelYear", "origin", "carName"])
# Datos descriptivos
base2.describe()
mpg cylinders displacement horsepower weight \
count 392.000000 392.000000 392.000000 392.000000 392.000000
mean 23.445918 5.471939 194.411990 104.469388 2977.584184
std 7.805007 1.705783 104.644004 38.491160 849.402560
min 9.000000 3.000000 68.000000 46.000000 1613.000000
25% 17.000000 4.000000 105.000000 75.000000 2225.250000
50% 22.750000 4.000000 151.000000 93.500000 2803.500000
75% 29.000000 8.000000 275.750000 126.000000 3614.750000
max 46.600000 8.000000 455.000000 230.000000 5140.000000
acceleration
count 392.000000
mean 15.541327
std 2.758864
min 8.000000
25% 13.775000
50% 15.500000
75% 17.025000
max 24.800000
# Comprobar
base2.head()
mpg cylinders displacement horsepower weight acceleration
0 18.0 8 307.0 130 3504 12.0
1 15.0 8 350.0 165 3693 11.5
2 18.0 8 318.0 150 3436 11.0
3 16.0 8 304.0 150 3433 12.0
4 17.0 8 302.0 140 3449 10.5
base2.tail()
mpg cylinders displacement horsepower weight acceleration
387 27.0 4 140.0 86 2790 15.6
388 44.0 4 97.0 52 2130 24.6
389 32.0 4 135.0 84 2295 11.6
390 28.0 4 120.0 79 2625 18.6
391 31.0 4 119.0 82 2720 19.4
# Correlations
plt.clf()  # clear the current figure before plotting
plt.figure(figsize=(10, 8))
sns.heatmap(base2.corr(), annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.title("Mapa de calor correlaciones")
plt.show()
# Histogram of every variable, one figure per column
for columna in base2.columns:
    plt.clf()
    sns.histplot(base2[columna], kde=True, bins=15)  # kde=True overlays a density curve
    plt.title(f'Histograma de {columna}')
    plt.ylabel("Frecuencia")
    plt.show()
Datos atípicos
Detección de datos atípicos
# Squared Mahalanobis distance of every row of base2 to the column means.
xi_xbar = base2 - base2.mean()  # centered data
inv_cov = np.linalg.inv(base2.cov())  # inverse of the sample covariance matrix
parte1 = np.dot(xi_xbar, inv_cov)
# Only the diagonal of parte1 @ xi_xbar.T is needed, so the row-wise quadratic
# form is computed directly instead of building the full n x n product.
mahalanobis_dist = np.einsum("ij,ij->i", parte1, xi_xbar.to_numpy())
# NOTE(review): the slice 1:20 starts at the second observation and yields 19
# values although the label says "first 20" - confirm whether [:20] was meant.
print("Primeras 20 distancias de mahalanobis:\n", mahalanobis_dist[1:20,])
Primeras 20 distancias de mahalanobis:
[ 5.4855309 6.23325333 6.58639283 7.41652595 12.61723731 21.8602741
17.73429663 24.21467952 10.60271274 11.63708548 9.59866958 14.84289216
72.77895434 2.54082699 1.77951952 4.06720816 5.32275947 2.5146385
9.11296795]
# Critical value used to flag outliers (chi-squared quantile)
gl = base2.shape[1]  # degrees of freedom (number of variables)
alpha = 0.05  # significance level
valor_critico = chi2.ppf(1 - alpha, df=gl)
print("Valor crítico: ",valor_critico)
Valor crítico: 12.591587243743977
# Positions of the observations whose distance exceeds the critical value
atipicos = np.where(mahalanobis_dist > valor_critico)[0]
# Mostrar el número de outliers
print("Número de datos atípicos: \n", len(atipicos))
Número de datos atípicos:
38
Estandarizar los datos
# Scaler
scaler = StandardScaler()

# Fit the scaler on base2 and transform it
base2Scale = scaler.fit_transform(base2)

# Wrap the scaled values in a DataFrame with the original column names
base2Scale = pd.DataFrame(base2Scale, columns=base2.columns)
# Base escalada/estandarizada
print(base2Scale.describe())
mpg cylinders displacement horsepower weight \
count 3.920000e+02 3.920000e+02 3.920000e+02 3.920000e+02 3.920000e+02
mean 1.450087e-16 -1.087565e-16 -7.250436e-17 -1.812609e-16 -1.812609e-17
std 1.001278e+00 1.001278e+00 1.001278e+00 1.001278e+00 1.001278e+00
min -1.853218e+00 -1.451004e+00 -1.209563e+00 -1.520975e+00 -1.608575e+00
25% -8.269250e-01 -8.640136e-01 -8.555316e-01 -7.665929e-01 -8.868535e-01
50% -8.927701e-02 -8.640136e-01 -4.153842e-01 -2.853488e-01 -2.052109e-01
75% 7.125143e-01 1.483947e+00 7.782764e-01 5.600800e-01 7.510927e-01
max 2.970359e+00 1.483947e+00 2.493416e+00 3.265452e+00 2.549061e+00
acceleration
count 3.920000e+02
mean 4.350262e-16
std 1.001278e+00
min -2.736983e+00
25% -6.410551e-01
50% -1.499869e-02
75% 5.384714e-01
max 3.360262e+00
Distancias euclídeas
# Condensed vector of pairwise Euclidean distances between the scaled rows
distancias = pdist(base2Scale, metric='euclidean')
# Expand it into a square, symmetric distance matrix
matrix_dist = squareform(distancias)
print("Matriz de distancias euclídeas - primeras 10 observaciones (datos estandarizados):\n", matrix_dist[0:10, 0:10])
Matriz de distancias euclídeas - primeras 10 observaciones (datos estandarizados):
[[0. 1.10856173 0.64799465 0.58679906 0.62208226 2.47793946
3.14838417 3.04838491 3.17512164 2.23490958]
[1.10856173 0. 0.72030087 0.69950458 0.95611168 1.47924318
2.11756175 2.04148692 2.12199296 1.33758728]
[0.64799465 0.72030087 0. 0.46422637 0.37513822 2.02609567
2.64014814 2.52479498 2.69795502 1.66338404]
[0.58679906 0.69950458 0.46422637 0. 0.61742688 2.16301084
2.79387535 2.70285523 2.80250528 1.90564543]
[0.62208226 0.95611168 0.37513822 0.61742688 0. 2.22653382
2.83348785 2.69440784 2.9218119 1.79356246]
[2.47793946 1.47924318 2.02609567 2.16301084 2.22653382 0.
0.73017065 0.72155003 0.76253361 0.90218114]
[3.14838417 2.11756175 2.64014814 2.79387535 2.83348785 0.73017065
0. 0.26503369 0.39462975 1.17744947]
[3.04838491 2.04148692 2.52479498 2.70285523 2.69440784 0.72155003
0.26503369 0. 0.63433417 0.98227778]
[3.17512164 2.12199296 2.69795502 2.80250528 2.9218119 0.76253361
0.39462975 0.63433417 0. 1.40996914]
[2.23490958 1.33758728 1.66338404 1.90564543 1.79356246 0.90218114
1.17744947 0.98227778 1.40996914 0. ]]
# Heat map of the distances between the first 10 observations
plt.clf()
plt.figure(figsize=(10, 8))
sns.heatmap(matrix_dist[0:10, 0:10], annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.title("Distancias euclídeas - Primeras 10 observaciones")
plt.show()
Cluster no-jerárquico - K-medias
Aplicar K-medias
# Inertia of each fit (sum of squared distances from every point to the
# centroid of its cluster).
inercia = []

# k-means is tried with 1 to 5 clusters
k_range = range(1, 6)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=252)
    kmeans.fit(base2Scale)
    inercia.append(kmeans.inertia_)
KMeans(n_clusters=5, random_state=252)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5, random_state=252)
# Elbow-method plot: inertia against the number of clusters
plt.clf()
plt.plot(k_range, inercia, marker='o')
plt.title('Inercia y número de clusters - k-medias')
plt.xlabel('Número de clusters')
plt.ylabel('Inercia')
plt.grid(True)
plt.show()
K-medias número óptimo de grupos(clusters)
# Use 4 clusters
kmeansOptimo = KMeans(n_clusters=4, random_state=252)

# New data frame holding the original, unscaled data
base3 = base.copy()

# Add the column with the cluster each observation belongs to
# (the model is fitted on the scaled data)
base3['cluster'] = kmeansOptimo.fit_predict(base2Scale)

# Show the first observations with their cluster
base3.head(10)
mpg cylinders displacement horsepower weight acceleration modelYear \
0 18.0 8 307.0 130 3504 12.0 70
1 15.0 8 350.0 165 3693 11.5 70
2 18.0 8 318.0 150 3436 11.0 70
3 16.0 8 304.0 150 3433 12.0 70
4 17.0 8 302.0 140 3449 10.5 70
5 15.0 8 429.0 198 4341 10.0 70
6 14.0 8 454.0 220 4354 9.0 70
7 14.0 8 440.0 215 4312 8.5 70
8 14.0 8 455.0 225 4425 10.0 70
9 15.0 8 390.0 190 3850 8.5 70
origin carName cluster
0 1 chevrolet chevelle malibu 0
1 1 buick skylark 320 0
2 1 plymouth satellite 0
3 1 amc rebel sst 0
4 1 ford torino 0
5 1 ford galaxie 500 0
6 1 chevrolet impala 0
7 1 plymouth fury iii 0
8 1 pontiac catalina 0
9 1 amc ambassador dpl 0
# Show the last 10 observations together with their cluster.
base3.tail(10)
mpg cylinders displacement horsepower weight acceleration \
382 26.0 4 156.0 92 2585 14.5
383 22.0 6 232.0 112 2835 14.7
384 32.0 4 144.0 96 2665 13.9
385 36.0 4 135.0 84 2370 13.0
386 27.0 4 151.0 90 2950 17.3
387 27.0 4 140.0 86 2790 15.6
388 44.0 4 97.0 52 2130 24.6
389 32.0 4 135.0 84 2295 11.6
390 28.0 4 120.0 79 2625 18.6
391 31.0 4 119.0 82 2720 19.4
modelYear origin carName cluster
382 82 1 chrysler lebaron medallion 1
383 82 1 ford granada l 2
384 82 3 toyota celica gt 1
385 82 1 dodge charger 2.2 1
386 82 1 chevrolet camaro 1
387 82 1 ford mustang gl 1
388 82 2 vw pickup 3
389 82 1 dodge rampage 1
390 82 1 ford ranger 3
391 82 1 chevy s-10 3
# Number of observations assigned to each cluster, ordered by cluster label.
_conteo_por_cluster = base3['cluster'].value_counts().sort_index()
print("Número de observaciones por cluster \n", _conteo_por_cluster)
Número de observaciones por cluster
cluster
0 97
1 122
2 82
3 91
Name: count, dtype: int64
PCA y cluster no-jerárquico
Aplicar PCA
# Pipeline that standardizes the variables and then runs a PCA keeping all components.
acp = make_pipeline(StandardScaler(), PCA())
acp.fit(base2)
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA())])
StandardScaler()
PCA()
# Pull the fitted PCA step out of the pipeline ('pca' is the name make_pipeline assigns).
modelo_acp = acp.named_steps['pca']

# Eigenvalues (variance explained by each component).
print("Resumen del modelo: \nAutovalores:\n", modelo_acp.explained_variance_)
Resumen del modelo:
Autovalores:
[4.80051237 0.73049462 0.25912836 0.12549716 0.06333798 0.03637479]
# Share of the total variance explained by each component.
_ratio_varianza = modelo_acp.explained_variance_ratio_
print("\nPorcentaje de varianza explicada:\n", _ratio_varianza)
Porcentaje de varianza explicada:
[0.79804436 0.12143852 0.04307789 0.02086284 0.0105294 0.006047 ]
# Cumulative share of variance explained as components are added.
print("\nVarianza acumulada:\n", modelo_acp.explained_variance_ratio_.cumsum())
Varianza acumulada:
[0.79804436 0.91948288 0.96256076 0.9834236 0.993953 1. ]
# Principal axes: one row per component, one column per original variable.
_ejes_principales = modelo_acp.components_
print("\nComponentes principales:\n", _ejes_principales)
Componentes principales:
[[-0.39897309 0.43061523 0.44353136 0.43412172 0.43010308 -0.29192568]
[-0.24483454 0.14831406 0.10849714 -0.1661584 0.28609545 0.8926523 ]
[ 0.85211071 0.40032254 0.29750498 0.02260028 0.12470286 0.09528071]
[ 0.2208091 -0.57631048 -0.11078321 0.67523694 0.34090709 0.18620982]
[ 0.0710939 -0.285904 0.00559368 -0.55858804 0.72661186 -0.27053257]
[-0.03119071 -0.46737615 0.83108055 -0.12431178 -0.27097508 0.03179437]]
# Scree plot of the cumulative explained variance.
prop_varianza_acum = np.cumsum(modelo_acp.explained_variance_ratio_)
componentes = range(1, len(prop_varianza_acum) + 1)  # 1-based component numbers

plt.clf()
plt.figure(figsize=(8, 5))
plt.plot(componentes, prop_varianza_acum, 'bo-', linewidth=2, markersize=8)
plt.xticks(componentes)
([<matplotlib.axis.XTick object at 0x000002805F08D1D0>, <matplotlib.axis.XTick object at 0x000002805F08F610>, <matplotlib.axis.XTick object at 0x000002805F08FD90>, <matplotlib.axis.XTick object at 0x000002805F0B8550>, <matplotlib.axis.XTick object at 0x000002805F0B8CD0>, <matplotlib.axis.XTick object at 0x000002805F0B9450>], [Text(1, 0, '1'), Text(2, 0, '2'), Text(3, 0, '3'), Text(4, 0, '4'), Text(5, 0, '5'), Text(6, 0, '6')])
# Title, axis labels and y-range (headroom above 1.0 leaves space for the labels).
plt.title('Porcentaje de varianza explicada acumulada')
plt.xlabel('Componente principal')
plt.ylabel('Por. varianza acumulada')
plt.ylim(0, 1.1)
(0.0, 1.1)
plt.grid(True, linestyle='--', alpha=0.7)

# Annotate each point with its cumulative percentage, slightly above the marker.
for i, v in enumerate(prop_varianza_acum):
    plt.text(componentes[i], v + 0.03, f"{v:.0%}",
             ha='center', va='bottom')

plt.tight_layout()
plt.show()
# Loadings heatmap: rows are the original variables, columns are the components.
plt.clf()
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 2))
ncomponentes = modelo_acp.components_
# components_ has one row per component; transpose so variables run along the y axis.
plt.imshow(ncomponentes.T, cmap='coolwarm', aspect='auto')
plt.yticks(range(len(base2.columns)), base2.columns)
([<matplotlib.axis.YTick object at 0x000002805F0EE850>, <matplotlib.axis.YTick object at 0x000002805F120550>, <matplotlib.axis.YTick object at 0x000002805F120CD0>, <matplotlib.axis.YTick object at 0x000002805F121450>, <matplotlib.axis.YTick object at 0x000002805F121BD0>, <matplotlib.axis.YTick object at 0x000002805F018050>], [Text(0, 0, 'mpg'), Text(0, 1, 'cylinders'), Text(0, 2, 'displacement'), Text(0, 3, 'horsepower'), Text(0, 4, 'weight'), Text(0, 5, 'acceleration')])
# Label the x axis with 1-based component numbers; positions and labels are both
# derived from n_components_ so their lengths always agree.
plt.xticks(range(modelo_acp.n_components_), np.arange(modelo_acp.n_components_) + 1)
([<matplotlib.axis.XTick object at 0x000002805F0EE0D0>, <matplotlib.axis.XTick object at 0x000002805F0182D0>, <matplotlib.axis.XTick object at 0x000002805F018A50>, <matplotlib.axis.XTick object at 0x000002805F0191D0>, <matplotlib.axis.XTick object at 0x000002805F019950>, <matplotlib.axis.XTick object at 0x000002805F01A0D0>], [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '3'), Text(3, 0, '4'), Text(4, 0, '5'), Text(5, 0, '6')])
# Final formatting of the loadings heatmap.
plt.grid(False)
plt.colorbar()
plt.title('Mapa de calor - Cargas')
plt.xlabel('Componente')
plt.ylabel('Variable')
plt.show()
# Biplot: observation scores and variable loadings on the first two components.
# Reuse the fitted pipeline (scaler + PCA) rather than fitting a second scaler.
scores = acp.transform(base2)[:, :2]
loadings = modelo_acp.components_.T[:, :2]

# Rescale the loadings so the arrows are visible at the scale of the scores.
scale_factor = 1.0 / (loadings.max() - loadings.min())
loadings_scaled = loadings * scale_factor * 7  # adjustable scale factor

plt.clf()
plt.figure(figsize=(12, 8))

# Plot the scores (observations).
plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5, label='Observaciones')

# Plot the loadings (variables) as arrows from the origin, labelled just past the tip.
for i, (x, y) in enumerate(loadings_scaled):
    plt.arrow(0, 0, x, y, color='red', head_width=0.05, alpha=0.8)
    plt.text(x * 1.15, y * 1.15, base2.columns[i], color='darkred',
             ha='center', va='center', fontsize=12)

plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
plt.axvline(0, color='gray', linestyle='--', alpha=0.5)
plt.xlabel(f'Componente 1 ({modelo_acp.explained_variance_ratio_[0]*100:.1f}%)', fontsize=12)
plt.ylabel(f'Componente 2 ({modelo_acp.explained_variance_ratio_[1]*100:.1f}%)', fontsize=12)
plt.title('Puntuaciones (scores) de las observaciones', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend()

# Symmetric axis limits with 20% padding so the origin sits at the centre.
max_val = max(np.abs(scores).max(), np.abs(loadings_scaled).max()) * 1.2
plt.xlim(-max_val, max_val)
(-6.208291890681096, 6.208291890681096)
# Same symmetric limits on the y axis.
plt.ylim(-max_val, max_val)
(-6.208291890681096, 6.208291890681096)
# Tighten the layout and render the figure.
plt.tight_layout()
plt.show()
Gráficos cluster y PCA
########## Clusters and observations on the principal components
# Project the observations with the fitted pipeline (scaling + PCA).
modelo_acp_transform = acp.transform(base2)

# Keep the first two component scores next to the cluster labels.
base3['PC1'] = modelo_acp_transform[:, 0]
base3['PC2'] = modelo_acp_transform[:, 1]

plt.clf()
plt.figure(figsize=(8, 6))
scatter = plt.scatter(base3['PC1'], base3['PC2'],
                      c=base3['cluster'], cmap='tab10', alpha=0.7)
plt.title('Cluster, obervaciones y PC')
plt.xlabel(f'Componente 1 ({modelo_acp.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'Componente 2 ({modelo_acp.explained_variance_ratio_[1]*100:.1f}%)')
plt.grid(True)
plt.colorbar(scatter, label='Cluster')
<matplotlib.colorbar.Colorbar object at 0x000002805F240F50>
# Render the cluster scatter plot.
plt.show()
############## Clusters, centroids and observations on the principal components
centroides = kmeansOptimo.cluster_centers_
# Project the centroids into PCA space with the same fitted PCA model.
# NOTE(review): assumes base2Scale was standardized the same way as the pipeline's scaler — confirm.
centroides_pca = modelo_acp.transform(centroides)

plt.clf()
plt.figure(figsize=(8, 6))

scatter = plt.scatter(base3['PC1'], base3['PC2'],
                      c=base3['cluster'], cmap='tab10', alpha=0.7)

# Centroids
plt.scatter(centroides_pca[:, 0], centroides_pca[:, 1],
            c='black', marker='X', s=200, label='Centroides')

plt.title('Cluster, centriodes, obervaciones y PC')
plt.xlabel(f'Componente 1 ({modelo_acp.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'Componente 2 ({modelo_acp.explained_variance_ratio_[1]*100:.1f}%)')
plt.grid(True)
plt.legend()
plt.colorbar(scatter, label='Cluster')
<matplotlib.colorbar.Colorbar object at 0x000002805AEBDF90>
# Render the cluster/centroid scatter plot.
plt.show()
########### Clusters, centroids, scores, loadings and components
cargas = modelo_acp.components_.T  # one row per variable, one column per component
nombres_variables = base2.columns

plt.clf()
plt.figure(figsize=(10, 8))

# Plot the observations, coloured by cluster.
scatter = plt.scatter(base3['PC1'], base3['PC2'],
                      c=base3['cluster'], cmap='tab10', alpha=0.7, label='Observaciones')

# Plot the loadings as arrows (x3 so they are visible at the scale of the scores),
# with the variable name placed just past the arrow tip (x3.2).
for i, var in enumerate(nombres_variables):
    plt.arrow(0, 0, cargas[i, 0]*3, cargas[i, 1]*3,
              color='black', alpha=0.7, head_width=0.1)
    plt.text(cargas[i, 0]*3.2, cargas[i, 1]*3.2, var, color='black', ha='center', va='center')

# Plot the centroids.
plt.scatter(centroides_pca[:, 0], centroides_pca[:, 1],
            c='black', marker='X', s=200)

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Cluster, centroides, puntuaciones, cargas y PC')
plt.grid(True)
plt.axhline(0, color='gray', linestyle='--', linewidth=0.5)
plt.axvline(0, color='gray', linestyle='--', linewidth=0.5)
plt.colorbar(scatter, label='Cluster')
<matplotlib.colorbar.Colorbar object at 0x000002805A0E2E90>
# Axis labels showing the share of variance explained by each component, then render.
plt.xlabel(f'Componente 1 ({modelo_acp.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'Componente 2 ({modelo_acp.explained_variance_ratio_[1]*100:.1f}%)')
plt.show()
############# Full list of observations with their assigned cluster
_columnas_listado = ["carName", "modelYear", "cluster"]
print(base3[_columnas_listado])
carName modelYear cluster
0 chevrolet chevelle malibu 70 0
1 buick skylark 320 70 0
2 plymouth satellite 70 0
3 amc rebel sst 70 0
4 ford torino 70 0
5 ford galaxie 500 70 0
6 chevrolet impala 70 0
7 plymouth fury iii 70 0
8 pontiac catalina 70 0
9 amc ambassador dpl 70 0
10 dodge challenger se 70 0
11 plymouth 'cuda 340 70 0
12 chevrolet monte carlo 70 0
13 buick estate wagon (sw) 70 0
14 toyota corona mark ii 70 1
15 plymouth duster 70 2
16 amc hornet 70 2
17 ford maverick 70 2
18 datsun pl510 70 1
19 volkswagen 1131 deluxe sedan 70 3
20 peugeot 504 70 1
21 audi 100 ls 70 1
22 saab 99e 70 1
23 bmw 2002 70 1
24 amc gremlin 70 2
25 ford f250 70 0
26 chevy c20 70 0
27 dodge d200 70 0
28 hi 1200d 70 0
29 datsun pl510 71 1
30 chevrolet vega 2300 71 1
31 toyota corona 71 1
32 amc gremlin 71 2
33 plymouth satellite custom 71 2
34 chevrolet chevelle malibu 71 2
35 ford torino 500 71 2
36 amc matador 71 2
37 chevrolet impala 71 0
38 pontiac catalina brougham 71 0
39 ford galaxie 500 71 0
40 plymouth fury iii 71 0
41 dodge monaco (sw) 71 0
42 ford country squire (sw) 71 0
43 pontiac safari (sw) 71 0
44 amc hornet sportabout (sw) 71 2
45 chevrolet vega (sw) 71 3
46 pontiac firebird 71 2
47 ford mustang 71 2
48 mercury capri 2000 71 1
49 opel 1900 71 1
50 peugeot 304 71 3
51 fiat 124b 71 1
52 toyota corolla 1200 71 3
53 datsun 1200 71 3
54 volkswagen model 111 71 3
55 plymouth cricket 71 3
56 toyota corona hardtop 72 1
57 dodge colt hardtop 72 1
58 volkswagen type 3 72 3
59 chevrolet vega 72 1
60 ford pinto runabout 72 1
61 chevrolet impala 72 0
62 pontiac catalina 72 0
63 plymouth fury iii 72 0
64 ford galaxie 500 72 0
65 amc ambassador sst 72 0
66 mercury marquis 72 0
67 buick lesabre custom 72 0
68 oldsmobile delta 88 royale 72 0
69 chrysler newport royal 72 0
70 mazda rx2 coupe 72 1
71 amc matador (sw) 72 0
72 chevrolet chevelle concours (sw) 72 0
73 ford gran torino (sw) 72 0
74 plymouth satellite custom (sw) 72 0
75 volvo 145e (sw) 72 1
76 volkswagen 411 (sw) 72 1
77 peugeot 504 (sw) 72 1
78 renault 12 (sw) 72 3
79 ford pinto (sw) 72 1
80 datsun 510 (sw) 72 1
81 toyouta corona mark ii (sw) 72 1
82 dodge colt (sw) 72 1
83 toyota corolla 1600 (sw) 72 1
84 buick century 350 73 0
85 amc matador 73 0
86 chevrolet malibu 73 0
87 ford gran torino 73 0
88 dodge coronet custom 73 0
89 mercury marquis brougham 73 0
90 chevrolet caprice classic 73 0
91 ford ltd 73 0
92 plymouth fury gran sedan 73 0
93 chrysler new yorker brougham 73 0
94 buick electra 225 custom 73 0
95 amc ambassador brougham 73 0
96 plymouth valiant 73 2
97 chevrolet nova custom 73 2
98 amc hornet 73 2
99 ford maverick 73 2
100 plymouth duster 73 2
101 volkswagen super beetle 73 3
102 chevrolet impala 73 0
103 ford country 73 0
104 plymouth custom suburb 73 0
105 oldsmobile vista cruiser 73 0
106 amc gremlin 73 2
107 toyota carina 73 1
108 chevrolet vega 73 3
109 datsun 610 73 1
110 maxda rx3 73 1
111 ford pinto 73 1
112 mercury capri v6 73 1
113 fiat 124 sport coupe 73 1
114 chevrolet monte carlo s 73 0
115 pontiac grand prix 73 0
116 fiat 128 73 3
117 opel manta 73 1
118 audi 100ls 73 1
119 volvo 144ea 73 1
120 dodge dart custom 73 0
121 saab 99le 73 1
122 toyota mark ii 73 2
123 oldsmobile omega 73 0
124 plymouth duster 74 2
125 amc hornet 74 2
126 chevrolet nova 74 2
127 datsun b210 74 3
128 ford pinto 74 1
129 toyota corolla 1200 74 3
130 chevrolet vega 74 1
131 chevrolet chevelle malibu classic 74 2
132 amc matador 74 2
133 plymouth satellite sebring 74 2
134 ford gran torino 74 0
135 buick century luxus (sw) 74 0
136 dodge coronet custom (sw) 74 0
137 ford gran torino (sw) 74 0
138 amc matador (sw) 74 0
139 audi fox 74 1
140 volkswagen dasher 74 1
141 opel manta 74 1
142 toyota corona 74 3
143 datsun 710 74 3
144 dodge colt 74 1
145 fiat 128 74 1
146 fiat 124 tc 74 1
147 honda civic 74 1
148 subaru 74 1
149 fiat x1.9 74 3
150 plymouth valiant custom 75 2
151 chevrolet nova 75 2
152 mercury monarch 75 2
153 ford maverick 75 2
154 pontiac catalina 75 0
155 chevrolet bel air 75 0
156 plymouth grand fury 75 0
157 ford ltd 75 0
158 buick century 75 2
159 chevroelt chevelle malibu 75 2
160 amc matador 75 2
161 plymouth fury 75 2
162 buick skyhawk 75 2
163 chevrolet monza 2+2 75 2
164 ford mustang ii 75 0
165 toyota corolla 75 1
166 ford pinto 75 1
167 amc gremlin 75 2
168 pontiac astro 75 1
169 toyota corona 75 1
170 volkswagen dasher 75 1
171 datsun 710 75 1
172 ford pinto 75 2
173 volkswagen rabbit 75 1
174 amc pacer 75 2
175 audi 100ls 75 1
176 peugeot 504 75 1
177 volvo 244dl 75 1
178 saab 99le 75 1
179 honda civic cvcc 75 3
180 fiat 131 76 1
181 opel 1900 76 1
182 capri ii 76 1
183 dodge colt 76 3
184 renault 12tl 76 1
185 chevrolet chevelle malibu classic 76 0
186 dodge coronet brougham 76 0
187 amc matador 76 0
188 ford gran torino 76 0
189 plymouth valiant 76 2
190 chevrolet nova 76 2
191 ford maverick 76 2
192 amc hornet 76 2
193 chevrolet chevette 76 3
194 chevrolet woody 76 3
195 vw rabbit 76 1
196 honda civic 76 3
197 dodge aspen se 76 2
198 ford granada ghia 76 2
199 pontiac ventura sj 76 2
200 amc pacer d/l 76 2
201 volkswagen rabbit 76 1
202 datsun b-210 76 3
203 toyota corolla 76 1
204 ford pinto 76 1
205 volvo 245 76 1
206 plymouth volare premier v8 76 0
207 peugeot 504 76 2
208 toyota mark ii 76 2
209 mercedes-benz 280s 76 2
210 cadillac seville 76 0
211 chevy c10 76 0
212 ford f108 76 0
213 dodge d100 76 0
214 honda accord cvcc 77 3
215 buick opel isuzu deluxe 77 1
216 renault 5 gtl 77 3
217 plymouth arrow gs 77 1
218 datsun f-10 hatchback 77 3
219 chevrolet caprice classic 77 0
220 oldsmobile cutlass supreme 77 2
221 dodge monaco brougham 77 0
222 mercury cougar brougham 77 0
223 chevrolet concours 77 2
224 buick skylark 77 2
225 plymouth volare custom 77 2
226 ford granada 77 2
227 pontiac grand prix lj 77 0
228 chevrolet monte carlo landau 77 0
229 chrysler cordoba 77 0
230 ford thunderbird 77 0
231 volkswagen rabbit custom 77 1
232 pontiac sunbird coupe 77 1
233 toyota corolla liftback 77 3
234 ford mustang ii 2+2 77 1
235 chevrolet chevette 77 3
236 dodge colt m/m 77 3
237 subaru dl 77 3
238 volkswagen dasher 77 1
239 datsun 810 77 2
240 bmw 320i 77 1
241 mazda rx-4 77 1
242 volkswagen rabbit custom diesel 78 3
243 ford fiesta 78 3
244 mazda glc deluxe 78 3
245 datsun b210 gx 78 3
246 honda civic cvcc 78 3
247 oldsmobile cutlass salon brougham 78 2
248 dodge diplomat 78 0
249 mercury monarch ghia 78 0
250 pontiac phoenix lj 78 2
251 chevrolet malibu 78 2
252 ford fairmont (auto) 78 2
253 ford fairmont (man) 78 1
254 plymouth volare 78 2
255 amc concord 78 2
256 buick century special 78 2
257 mercury zephyr 78 2
258 dodge aspen 78 2
259 amc concord d/l 78 2
260 chevrolet monte carlo landau 78 0
261 buick regal sport coupe (turbo) 78 0
262 ford futura 78 0
263 dodge magnum xe 78 0
264 chevrolet chevette 78 3
265 toyota corona 78 1
266 datsun 510 78 1
267 dodge omni 78 1
268 toyota celica gt liftback 78 1
269 plymouth sapporo 78 1
270 oldsmobile starfire sx 78 1
271 datsun 200-sx 78 1
272 audi 5000 78 1
273 volvo 264gl 78 2
274 saab 99gle 78 1
275 peugeot 604sl 78 2
276 volkswagen scirocco 78 1
277 honda accord lx 78 3
278 pontiac lemans v6 79 2
279 mercury zephyr 6 79 2
280 ford fairmont 4 79 1
281 amc concord dl 6 79 2
282 dodge aspen 6 79 2
283 chevrolet caprice classic 79 0
284 ford ltd landau 79 0
285 mercury grand marquis 79 0
286 dodge st. regis 79 0
287 buick estate wagon (sw) 79 0
288 ford country squire (sw) 79 0
289 chevrolet malibu classic (sw) 79 2
290 chrysler lebaron town @ country (sw) 79 0
291 vw rabbit custom 79 1
292 maxda glc deluxe 79 3
293 dodge colt hatchback custom 79 3
294 amc spirit dl 79 1
295 mercedes benz 300d 79 2
296 cadillac eldorado 79 2
297 peugeot 504 79 3
298 oldsmobile cutlass salon brougham 79 2
299 plymouth horizon 79 1
300 plymouth horizon tc3 79 3
301 datsun 210 79 3
302 fiat strada custom 79 3
303 buick skylark limited 79 1
304 chevrolet citation 79 1
305 oldsmobile omega brougham 79 1
306 pontiac phoenix 79 1
307 vw rabbit 80 3
308 toyota corolla tercel 80 3
309 chevrolet chevette 80 3
310 datsun 310 80 3
311 chevrolet citation 80 1
312 ford fairmont 80 1
313 amc concord 80 3
314 dodge aspen 80 2
315 audi 4000 80 3
316 toyota corona liftback 80 1
317 mazda 626 80 3
318 datsun 510 hatchback 80 1
319 toyota corolla 80 1
320 mazda glc 80 3
321 dodge colt 80 1
322 datsun 210 80 3
323 vw rabbit c (diesel) 80 3
324 vw dasher (diesel) 80 3
325 audi 5000s (diesel) 80 3
326 mercedes-benz 240d 80 3
327 honda civic 1500 gl 80 3
328 subaru dl 80 3
329 vokswagen rabbit 80 3
330 datsun 280-zx 80 1
331 mazda rx-7 gs 80 1
332 triumph tr7 coupe 80 1
333 honda accord 80 3
334 plymouth reliant 81 1
335 buick skylark 81 1
336 dodge aries wagon (sw) 81 1
337 chevrolet citation 81 1
338 plymouth reliant 81 1
339 toyota starlet 81 3
340 plymouth champ 81 3
341 honda civic 1300 81 3
342 subaru 81 3
343 datsun 210 mpg 81 3
344 toyota tercel 81 3
345 mazda glc 4 81 3
346 plymouth horizon 4 81 3
347 ford escort 4w 81 3
348 ford escort 2h 81 3
349 volkswagen jetta 81 1
350 honda prelude 81 1
351 toyota corolla 81 3
352 datsun 200sx 81 1
353 mazda 626 81 3
354 peugeot 505s turbo diesel 81 3
355 volvo diesel 81 3
356 toyota cressida 81 1
357 datsun 810 maxima 81 1
358 buick century 81 2
359 oldsmobile cutlass ls 81 2
360 ford granada gl 81 2
361 chrysler lebaron salon 81 2
362 chevrolet cavalier 82 3
363 chevrolet cavalier wagon 82 3
364 chevrolet cavalier 2-door 82 3
365 pontiac j2000 se hatchback 82 1
366 dodge aries se 82 1
367 pontiac phoenix 82 1
368 ford fairmont futura 82 1
369 volkswagen rabbit l 82 3
370 mazda glc custom l 82 3
371 mazda glc custom 82 3
372 plymouth horizon miser 82 3
373 mercury lynx l 82 3
374 nissan stanza xe 82 1
375 honda accord 82 3
376 toyota corolla 82 3
377 honda civic 82 3
378 honda civic (auto) 82 3
379 datsun 310 gx 82 3
380 buick century limited 82 2
381 oldsmobile cutlass ciera (diesel) 82 3
382 chrysler lebaron medallion 82 1
383 ford granada l 82 2
384 toyota celica gt 82 1
385 dodge charger 2.2 82 1
386 chevrolet camaro 82 1
387 ford mustang gl 82 1
388 vw pickup 82 3
389 dodge rampage 82 1
390 ford ranger 82 3
391 chevy s-10 82 3