#install.packages('skimr')
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(skimr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
library(mltools)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::replace_na() masks mltools::replace_na()
## ✖ purrr::transpose() masks data.table::transpose()
#install.packages('gbm')
library(gbm)
## Loaded gbm 2.1.8.1
#install.packages('pROC')
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
A. Efectúe un ACP (Análisis de Componentes Principales) usando la función PCA (…) del
paquete FactoMineR y dé una interpretación siguiendo los siguientes pasos:
1) Cargue la tabla de datos y ejecute un str (...), summary (...) y un dim (…),
2) Verifique la correcta lectura de los datos.
3) Elimine de la tabla de datos las variables categóricas.
4) Grafique el plano principal con plot (...) y con fviz_pca_ind (…).
5) Grafique el círculo de correlación con plot (...) y con fviz_pca_var (…).
6) Repita los ejercicios anteriores pero esta vez elimine individuos y variables mal representados (coseno cuadrado menor al 5%). ¿Cuáles individuos y variables se eliminaron?
7) En el círculo de correlación interprete las variables.
8) Interprete la formación de los clústeres basado en la sobreposición del círculo y el plano.
B. Ejecute un Clustering jerárquico con la agregación del Salto Máximo y Ward.
C. Interprete los resultados del ejercicio anterior para el caso de agregación de Ward usando gráficos tipo radar, use 4 clústeres
D. Construya y grafique un Clustering jerárquico sobre las componentes principales del ACP.
# Read the vehicle dataset from CSV into a tibble; readr guesses column types.
df_train <- read_csv(
  file = "~/documents/r_projects/vehiculos_c2/datos/vehiculos.csv"
)
## Rows: 100 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): car type, fueltype, doornumber
## dbl (8): carlength, carheight, enginesize, boreratio, horsepower, citympg, h...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the raw data.
df_train %>% head()
## # A tibble: 6 × 11
## `car type` fuelt…¹ doorn…² carle…³ carhe…⁴ engin…⁵ borer…⁶ horse…⁷ citympg
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 alfa-romero g… gas two 169. 48.8 130 3.47 111 21
## 2 alfa-romero s… gas two 169. 48.8 130 3.47 111 21
## 3 audi fox gas two 177. 53.1 136 3.19 110 19
## 4 audi 5000 gas four 193. 55.7 136 3.19 110 19
## 5 bmw 320i gas two 177. 54.3 108 3.5 101 23
## 6 bmw x3 gas four 177. 54.3 164 3.31 121 21
## # … with 2 more variables: highwaympg <dbl>, price <dbl>, and abbreviated
## # variable names ¹fueltype, ²doornumber, ³carlength, ⁴carheight, ⁵enginesize,
## # ⁶boreratio, ⁷horsepower
Se espera que el dataset contenga 11 variables y 100 registros.
cartype: Variable categórica que representa el tipo de vehículo.
fueltype: Variable categórica de 2 niveles. Describe el tipo de combustible que utiliza el carro: gasolina (denotada como gas) o diésel.
doornumber: Variable categórica de 2 niveles. Describe el número de puertas del carro: dos o cuatro.
carlength: Variable numérica. Indica el largo total del carro en pulgadas.
carheight: Variable numérica. Indica la altura total del carro en pulgadas.
enginesize: Variable numérica. Indica el volumen nominal del motor en pulgadas cúbicas.
boreratio: Variable numérica. Indica la relación entre el diámetro interno del cilindro y el recorrido del pistón (desde la altura más baja hasta la altura más alta).
horsepower: Variable numérica. Indica la potencia del motor del carro en caballos de fuerza.
citympg: Variable numérica. Indica el rendimiento promedio de combustible del carro en la ciudad en millas por galón.
highwaympg: Variable numérica. Indica el rendimiento promedio de combustible del carro en pista en millas por galón.
price: Variable numérica. Indica el precio del carro en dólares.
# Number of rows and columns, in that order.
c(nrow(df_train), ncol(df_train))
## [1] 100 11
# Compact structure: one line per column with its type and first values.
df_train %>% str()
## spc_tbl_ [100 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ car type : chr [1:100] "alfa-romero giulia" "alfa-romero stelvio" "audi fox" "audi 5000" ...
## $ fueltype : chr [1:100] "gas" "gas" "gas" "gas" ...
## $ doornumber: chr [1:100] "two" "two" "two" "four" ...
## $ carlength : num [1:100] 169 169 177 193 177 ...
## $ carheight : num [1:100] 48.8 48.8 53.1 55.7 54.3 54.3 55.7 55.7 53.7 52 ...
## $ enginesize: num [1:100] 130 130 136 136 108 164 164 209 209 90 ...
## $ boreratio : num [1:100] 3.47 3.47 3.19 3.19 3.5 3.31 3.31 3.62 3.62 3.03 ...
## $ horsepower: num [1:100] 111 111 110 110 101 121 121 182 182 70 ...
## $ citympg : num [1:100] 21 21 19 19 23 21 20 16 16 38 ...
## $ highwaympg: num [1:100] 27 27 25 25 29 28 25 22 22 43 ...
## $ price : num [1:100] 13495 16500 15250 18920 16430 ...
## - attr(*, "spec")=
## .. cols(
## .. `car type` = col_character(),
## .. fueltype = col_character(),
## .. doornumber = col_character(),
## .. carlength = col_double(),
## .. carheight = col_double(),
## .. enginesize = col_double(),
## .. boreratio = col_double(),
## .. horsepower = col_double(),
## .. citympg = col_double(),
## .. highwaympg = col_double(),
## .. price = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary statistics.
df_train %>% summary()
## car type fueltype doornumber carlength
## Length:100 Length:100 Length:100 Min. :144.6
## Class :character Class :character Class :character 1st Qu.:159.1
## Mode :character Mode :character Mode :character Median :171.2
## Mean :172.3
## 3rd Qu.:180.6
## Max. :208.1
## carheight enginesize boreratio horsepower
## Min. :47.80 Min. : 70.0 Min. :2.910 Min. : 52.0
## 1st Qu.:51.60 1st Qu.: 92.0 1st Qu.:3.030 1st Qu.: 69.0
## Median :53.90 Median :108.5 Median :3.290 Median : 92.0
## Mean :53.66 Mean :126.7 Mean :3.312 Mean :102.1
## 3rd Qu.:55.62 3rd Qu.:141.0 3rd Qu.:3.598 3rd Qu.:114.0
## Max. :59.80 Max. :326.0 Max. :3.940 Max. :262.0
## citympg highwaympg price
## Min. :13.00 Min. :16.00 Min. : 5118
## 1st Qu.:19.00 1st Qu.:25.00 1st Qu.: 7295
## Median :25.50 Median :31.00 Median : 9418
## Mean :25.63 Mean :31.01 Mean :12865
## 3rd Qu.:31.00 3rd Qu.:37.00 3rd Qu.:15544
## Max. :49.00 Max. :54.00 Max. :45400
# Transposed preview: every column on its own line.
df_train %>% glimpse()
## Rows: 100
## Columns: 11
## $ `car type` <chr> "alfa-romero giulia", "alfa-romero stelvio", "audi fox", "a…
## $ fueltype <chr> "gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas", "ga…
## $ doornumber <chr> "two", "two", "two", "four", "two", "four", "four", "four",…
## $ carlength <dbl> 168.8, 168.8, 177.3, 192.7, 176.8, 176.8, 189.0, 189.0, 193…
## $ carheight <dbl> 48.8, 48.8, 53.1, 55.7, 54.3, 54.3, 55.7, 55.7, 53.7, 52.0,…
## $ enginesize <dbl> 130, 130, 136, 136, 108, 164, 164, 209, 209, 90, 90, 90, 90…
## $ boreratio <dbl> 3.47, 3.47, 3.19, 3.19, 3.50, 3.31, 3.31, 3.62, 3.62, 3.03,…
## $ horsepower <dbl> 111, 111, 110, 110, 101, 121, 121, 182, 182, 70, 70, 68, 68…
## $ citympg <dbl> 21, 21, 19, 19, 23, 21, 20, 16, 16, 38, 38, 37, 31, 24, 31,…
## $ highwaympg <dbl> 27, 27, 25, 25, 29, 28, 25, 22, 22, 43, 43, 41, 38, 30, 38,…
## $ price <dbl> 13495, 16500, 15250, 18920, 16430, 21105, 24565, 30760, 413…
# skimr overview: missing counts, completeness and distribution per column,
# reported separately for character and numeric variables.
skim(df_train)
| Name | df_train |
| Number of rows | 100 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| car type | 0 | 1 | 6 | 31 | 0 | 100 | 0 |
| fueltype | 0 | 1 | 3 | 6 | 0 | 2 | 0 |
| doornumber | 0 | 1 | 3 | 4 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| carlength | 0 | 1 | 172.30 | 13.43 | 144.60 | 159.10 | 171.20 | 180.60 | 208.10 | ▂▇▆▃▁ |
| carheight | 0 | 1 | 53.66 | 2.60 | 47.80 | 51.60 | 53.90 | 55.62 | 59.80 | ▂▆▇▆▂ |
| enginesize | 0 | 1 | 126.74 | 49.34 | 70.00 | 92.00 | 108.50 | 141.00 | 326.00 | ▇▃▁▁▁ |
| boreratio | 0 | 1 | 3.31 | 0.28 | 2.91 | 3.03 | 3.29 | 3.60 | 3.94 | ▇▆▅▅▂ |
| horsepower | 0 | 1 | 102.11 | 40.33 | 52.00 | 69.00 | 92.00 | 114.00 | 262.00 | ▇▅▂▁▁ |
| citympg | 0 | 1 | 25.63 | 6.89 | 13.00 | 19.00 | 25.50 | 31.00 | 49.00 | ▆▇▆▂▁ |
| highwaympg | 0 | 1 | 31.01 | 7.29 | 16.00 | 25.00 | 31.00 | 37.00 | 54.00 | ▃▇▇▂▁ |
| price | 0 | 1 | 12865.45 | 8746.25 | 5118.00 | 7295.00 | 9418.00 | 15543.75 | 45400.00 | ▇▃▁▁▁ |
Encontramos en la lectura de los datos:
Columnas: 3 variables categóricas y 8 numéricas.
Filas: 100 registros. No hay valores nulos ni vacíos.
Los datos leídos del archivo tienen la estructura esperada.
# Re-inspect column names and types before choosing the numeric subset.
df_train %>% glimpse()
## Rows: 100
## Columns: 11
## $ `car type` <chr> "alfa-romero giulia", "alfa-romero stelvio", "audi fox", "a…
## $ fueltype <chr> "gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas", "ga…
## $ doornumber <chr> "two", "two", "two", "four", "two", "four", "four", "four",…
## $ carlength <dbl> 168.8, 168.8, 177.3, 192.7, 176.8, 176.8, 189.0, 189.0, 193…
## $ carheight <dbl> 48.8, 48.8, 53.1, 55.7, 54.3, 54.3, 55.7, 55.7, 53.7, 52.0,…
## $ enginesize <dbl> 130, 130, 136, 136, 108, 164, 164, 209, 209, 90, 90, 90, 90…
## $ boreratio <dbl> 3.47, 3.47, 3.19, 3.19, 3.50, 3.31, 3.31, 3.62, 3.62, 3.03,…
## $ horsepower <dbl> 111, 111, 110, 110, 101, 121, 121, 182, 182, 70, 70, 68, 68…
## $ citympg <dbl> 21, 21, 19, 19, 23, 21, 20, 16, 16, 38, 38, 37, 31, 24, 31,…
## $ highwaympg <dbl> 27, 27, 25, 25, 29, 28, 25, 22, 22, 43, 43, 41, 38, 30, 38,…
## $ price <dbl> 13495, 16500, 15250, 18920, 16430, 21105, 24565, 30760, 413…
# Column names of df_train grouped by variable type.
# Each entry must equal the column name exactly. Backticks are only the
# syntax used to quote non-syntactic names in code, so the name is stored as
# "car type" without them; otherwise it would not match the column.
categorical_columns <- c(
  "car type",
  "fueltype",
  "doornumber"
)

# The eight numeric measurements used for PCA and clustering.
numerical_columns <- c(
  "carlength",
  "carheight",
  "enginesize",
  "boreratio",
  "horsepower",
  "citympg",
  "highwaympg",
  "price"
)
# Keep only the numeric columns (the categorical ones are dropped).
df.numerical <- df_train[, numerical_columns]
df.numerical %>% head()
## # A tibble: 6 × 8
## carlength carheight enginesize boreratio horsepower citympg highwaympg price
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 169. 48.8 130 3.47 111 21 27 13495
## 2 169. 48.8 130 3.47 111 21 27 16500
## 3 177. 53.1 136 3.19 110 19 25 15250
## 4 193. 55.7 136 3.19 110 19 25 18920
## 5 177. 54.3 108 3.5 101 23 29 16430
## 6 177. 54.3 164 3.31 121 21 28 21105
# Base-graphics plot of the numeric table, for a first look at how the
# variables relate to each other pairwise.
plot(df.numerical)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# PCA on the standardized numeric variables (scale.unit = TRUE), keeping 8
# components, one per input variable. graph = TRUE also draws FactoMineR's
# default plots. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
df.pca <- PCA(df.numerical, scale.unit = TRUE, ncp = 8, graph = TRUE)

# Principal plane (dimensions 1 and 2) with individuals coloured by cos2.
# The plot() generic dispatches to the PCA method; calling plot.PCA()
# directly would bypass S3 dispatch.
plot(df.pca, axes = c(1, 2), choix = "ind", habillage = "cos2")

# factoextra views: combined individuals/variables plot, then individuals only.
fviz_pca(df.pca)
fviz_pca_ind(df.pca)
Grupos de variables correlacionadas:
1. citympg, highwaympg
2. price, enginesize, horsepower
Variables con poca correlación :
carheight
carlength
boreratio
# Correlation circle for dimensions 1 and 2, drawn through the plot() generic.
plot(df.pca, choix = "var", axes = c(1, 2), habillage = "cos2")

# Same circle with factoextra: variables coloured by cos2 on a
# red -> yellow -> green gradient, with labels repelled to limit overlap.
fviz_pca_var(
  df.pca,
  repel = TRUE,
  col.var = "cos2",
  gradient.cols = c("#FC0011", "#E7B800", "#00AF0B")
)
## 6) Repita los ejercicios anteriores pero esta vez elimine individuos y variables mal representados (coseno cuadrado menor al 5%). ¿Cuáles individuos y variables se eliminaron?
Panigrahi, 2019. BioSakshat Free Study Materials. Capítulo 12. Obtenido en https://biosakshat.github.io/pca.html
Siguiendo la línea de (Panigrahi, 2019), se toman los dos primeros componentes como significativos, ya que ambos tienen un eigenvalue > 1 y en conjunto representan un 83.9% de la varianza.
Con base en esta selección se procede a buscar individuos y variables mal representados, es decir, con valores de coseno cuadrado menores al 5% sobre esas dos dimensiones.
#### Individuos
Los individuos número 71 y 75 no están bien representados por los 2 primeros componentes principales.
Tienen un cos2 que suma menos del 5% sobre los dos primeros componentes.
En el plano principal estos son los dos individuos más cercanos al centro de las 2 dimensiones.
La variable que tiene menor representación en los primeros 2 componentes es boreratio con 63%.
La variable carheight tiene una representación muy baja en la primera dimensión pero tiene valores altos de cos2 en la segunda dimensión por lo que tampoco se puede considerar mal representada.
Al no tener ninguna variable con representación por debajo del 5% se conservan todas.
# Only the first two principal components have eigenvalues above 1; together
# they account for about 84% of the variance.
df.pca %>% get_eigenvalue()
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 5.48748179 68.5935224 68.59352
## Dim.2 1.22460753 15.3075942 83.90112
## Dim.3 0.53065664 6.6332080 90.53432
## Dim.4 0.40405295 5.0506619 95.58499
## Dim.5 0.14546425 1.8183031 97.40329
## Dim.6 0.10636021 1.3295026 98.73279
## Dim.7 0.08043172 1.0053965 99.73819
## Dim.8 0.02094491 0.2618114 100.00000
# Calidad de la representación
library("corrplot")
## corrplot 0.92 loaded
# cos2 of every variable on every dimension, as a matrix plot
# (is.corr = FALSE because cos2 values are not correlations).
corrplot(t(df.pca$var$cos2), is.corr = FALSE)

# Bar chart of the variables' cos2.
fviz_cos2(df.pca, choice = "var")

# Individuals whose cos2 summed over dimensions 1 and 2 is at most 5%.
df.numerical %>%
  filter(rowSums(df.pca$ind$cos2[, 1:2]) <= 0.05)
## # A tibble: 2 × 8
## carlength carheight enginesize boreratio horsepower citympg highwaympg price
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 172 52.5 108 3.62 94 26 32 9960
## 2 174. 53 108 3.62 94 25 31 10198
# Quality of representation of each variable on the first two dimensions.
df.pca$var$cos2[, 1:2] %>% rowSums()
## carlength carheight enginesize boreratio horsepower citympg highwaympg
## 0.8931511 0.9453902 0.8204011 0.6306634 0.9177442 0.8324384 0.8585659
## price
## 0.8137350
# Keep only the individuals whose cos2 summed over the first two dimensions
# exceeds 5%, then refit the PCA on the remaining rows.
df.quality <- filter(df.numerical, rowSums(df.pca$ind$cos2[, 1:2]) > 0.05)

# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
df.q.pca <- PCA(df.quality, scale.unit = TRUE, ncp = 8, graph = TRUE)

# Principal plane of the refitted PCA. The plot() generic dispatches to the
# PCA method; calling plot.PCA() directly would bypass S3 dispatch.
plot(df.q.pca, axes = c(1, 2), choix = "ind", habillage = "cos2")
fviz_pca(df.q.pca)

# Correlation circle of the refitted PCA, base and factoextra versions.
plot(df.q.pca, axes = c(1, 2), choix = "var", habillage = "cos2")
fviz_pca_var(
  df.q.pca,
  col.var = "cos2",
  gradient.cols = c("#FC0011", "#E7B800", "#00AF0B"),
  repel = TRUE
)
Tenemos dos grupos de variables: las variables dentro de cada grupo se correlacionan entre sí, pero tienen una correlación negativa respecto al otro grupo:
g.1 citympg y highwaympg.
g.2 price, enginesize, carlength, boreratio y horsepower.
Mientras que la variable carheight tiene muy poca correlación con las demás.
Las variables de g.1 y g.2 aportan casi toda la varianza que se explica por el componente principal 1, con valores de cos2 por encima de 0.75.
Mientras que el PC2 recibe varianza de la variable carheight y, en menor medida, de carlength y horsepower; todas las demás variables están por debajo de 25%.
library(NbClust)
# Run NbClust's indices on the first two principal-component coordinates,
# using median linkage, and print the full result.
NbClust(
  data = df.q.pca$ind$coord[, 1:2],
  method = "median"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 1 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 6 proposed 9 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 5 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 0.3508 65.0758 2.6856 -5.5720 94.2233 99322.36 25487.5001 393.4454
## 3 1.3674 34.4285 7.9704 -11.8516 106.6956 196769.42 23004.1620 382.7385
## 4 1.1761 27.2450 45.7261 -13.6167 119.1210 308156.12 21793.7191 353.1129
## 5 1.4997 41.3604 6.1003 -11.0952 174.9717 272322.79 13656.0622 237.5549
## 6 0.7722 36.0866 1.7376 -12.3338 186.2949 349354.88 10820.4014 222.9317
## 7 1.5003 30.5935 4.9031 -13.8066 188.8387 463326.80 10325.8850 218.7992
## 8 0.1614 28.0250 105.6083 -14.7056 208.8253 493513.70 9090.7315 207.6130
## 9 17.5360 65.7586 4.8987 -7.6453 334.3251 173557.04 3420.0592 95.5234
## 10 0.5855 61.5147 1.1194 -8.3735 345.3594 191451.11 3200.1512 90.5399
## 11 1.7847 55.5410 3.8693 -9.4329 348.0459 225391.75 3151.0083 89.4027
## 12 0.2053 52.4789 31.5378 -10.0795 356.8074 245294.47 2928.0499 85.5959
## 13 5.3182 67.5800 11.2739 -7.8495 410.9447 165690.79 970.0173 62.6288
## 14 0.2649 70.6813 27.8408 -7.5447 437.4808 146578.85 743.9483 55.2948
## 15 7.2640 88.3104 8.5933 -5.5251 502.4924 86674.93 731.5245 41.5301
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 1.2339 1.6779 0.3129 0.8235 0.4780 1.6854 -4.4735 -0.3728 0.3804
## 3 1.4287 1.7248 0.3123 0.7355 0.4415 0.2833 25.2992 2.2999 0.3294
## 4 1.6923 1.8695 0.3113 0.6141 0.3693 0.6615 42.4686 0.5056 0.3119
## 5 3.0532 2.7789 0.3944 0.6503 0.2548 0.9335 4.8429 0.0702 0.3369
## 6 3.4120 2.9612 0.3888 0.6407 0.1998 1.4884 -4.2660 -0.3047 0.3121
## 7 3.5032 3.0172 0.3887 0.5904 0.2082 0.9455 3.8028 0.0568 0.2898
## 8 4.0084 3.1797 0.3813 0.7159 0.1702 0.4192 87.2737 1.3637 0.2812
## 9 11.5616 6.9109 0.3253 0.6357 0.3669 0.2767 10.4540 2.0908 0.2924
## 10 12.8225 7.2913 0.3241 0.6141 0.3644 2.2832 -0.5620 -0.2810 0.2782
## 11 13.1521 7.3840 0.3241 0.5824 0.3444 0.2102 15.0326 3.0065 0.2654
## 12 14.3018 7.7124 0.3235 0.5486 0.3375 0.5612 28.9353 0.7615 0.2547
## 13 18.9238 10.5407 0.3674 0.5065 0.3894 0.7470 11.1795 0.3288 0.2541
## 14 20.1544 11.9388 0.3466 0.4945 0.4058 0.5184 22.3006 0.8920 0.2494
## 15 30.2242 15.8957 0.3836 0.5296 0.4308 0.8200 6.5860 0.2125 0.2454
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 196.7227 0.6009 3.5420 0.1438 0.1020 0.0028 1.5575 1.7863 0.9857
## 3 127.5795 0.6007 7.8512 0.1460 0.1020 0.0030 1.3919 1.7534 0.4039
## 4 88.2782 0.5954 2.8016 0.1535 0.1020 0.0031 1.1726 1.6604 0.3304
## 5 47.5110 0.5425 1.7350 0.4367 0.1487 0.0030 1.0734 1.3348 0.2446
## 6 37.1553 0.5355 302.9351 0.4885 0.1417 0.0031 1.3501 1.2847 0.2065
## 7 31.2570 0.5323 3.0481 0.4958 0.1417 0.0031 1.3643 1.2629 0.1855
## 8 25.9516 0.5097 0.4288 0.5905 0.1417 0.0032 1.4389 1.2197 0.1872
## 9 10.6137 0.5373 0.7786 1.0663 0.1190 0.0033 1.4855 0.8473 0.1448
## 10 9.0540 0.5363 -35.4160 1.0731 0.1190 0.0034 1.3915 0.8164 0.0933
## 11 8.1275 0.5357 1.6226 1.0760 0.1190 0.0034 2.0108 0.7990 0.0758
## 12 7.1330 0.5341 0.2426 1.0848 0.1190 0.0034 1.9953 0.7680 0.0593
## 13 4.8176 0.5389 0.7298 1.1248 0.1556 0.0034 2.0422 0.6774 0.0298
## 14 3.9496 0.5235 0.5562 1.2155 0.1556 0.0035 2.0879 0.6302 0.0271
## 15 2.7687 0.4994 1.0374 1.3564 0.1958 0.0035 2.1535 0.5546 0.0251
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 -0.0027 -4016.9908 1.0000
## 3 -0.0307 -335.8023 0.1262
## 4 0.4140 117.4720 0.6041
## 5 0.3867 107.8309 0.9323
## 6 0.0445 279.0372 1.0000
## 7 0.3824 106.5810 0.9448
## 8 0.3756 104.7312 0.2595
## 9 -0.3258 -16.2785 0.1860
## 10 -0.7431 -2.3458 1.0000
## 11 -0.3258 -16.2785 0.1062
## 12 0.2865 92.1310 0.4706
## 13 0.2646 91.7349 0.7210
## 14 0.1977 97.3833 0.4165
## 15 0.2454 92.2265 0.8092
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 9.000 15.0000 9.0000 15.0000 9.0000 9.0 5.000
## Value_Index 17.536 88.3104 100.7096 -5.5251 125.4998 337850.7 8137.657
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 9.0000 15.0000 9.0000 4.0000 14.0000 2.000 2.0000
## Value_Index 107.1062 10.0698 -3.3508 0.3113 0.4945 0.478 1.6854
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 4.0000 2.0000 2.0000 3.0000 2.0000 7.0000 2.0000
## Value_Index 42.4686 -0.3728 0.3804 69.1432 0.6009 3.0481 0.1438
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 15.0000 0 5.0000 0 15.0000
## Value_Index 0.1958 0 1.0734 0 0.0251
##
## $Best.partition
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## 1 1 1 2 2 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
## 1 1 2 2 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Gap statistic for k-means on the first two principal components, trying up
# to 8 clusters.
fviz_nbclust(
  df.q.pca$ind$coord[, 1:2],
  FUNcluster = kmeans,
  method = "gap_stat",
  k.max = 8
)

# k-means with 9 clusters on the same coordinates. The metric name was
# misspelled as "eucliden" before.
ec <- eclust(
  df.q.pca$ind$coord[, 1:2],
  "kmeans",
  hc_metric = "euclidean",
  k = 9
)

# PCA plot coloured by k-means cluster number. There is no trailing comma
# after the last argument: it would pass an empty argument on to fviz_pca().
fviz_pca(
  df.q.pca,
  col.ind = ec$cluster,
  gradient.cols = c(
    "magenta", "pink", "red", "orange", "yellow",
    "green", "blue", "cyan", "gray"
  )
)

# Hierarchical clustering with Ward aggregation (ward.D2), 9 clusters.
df.hclust.ward <- eclust(
  df.q.pca$ind$coord[, 1:2],
  FUNcluster = "hclust",
  hc_metric = "euclidean",
  hc_method = "ward.D2",
  k = 9
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
# Ward result: dendrogram with a rectangle around each cluster, then the
# clusters drawn on the two coordinates.
fviz_dend(df.hclust.ward, rect = TRUE)
fviz_cluster(df.hclust.ward)

# Same analysis with complete linkage (maximum jump), again with 9 clusters.
df.hclust.salto <- eclust(
  df.q.pca$ind$coord[, 1:2],
  FUNcluster = "hclust",
  hc_method = "complete",
  hc_metric = "euclidean",
  k = 9
)
fviz_dend(df.hclust.salto, rect = TRUE)
fviz_cluster(df.hclust.salto)
El clustering jerárquico nos da agrupaciones muy similares a las que obtenemos con k-means. Las diferencias se deben a la asignación de algunos individuos que, en ambos casos, se encuentran en los extremos de los clústeres; la mayoría de los registros quedan en los mismos grupos con ambos métodos.
# Ward hierarchical clustering on the first two principal components, this
# time cut into 4 clusters.
df.hclust.ward <- eclust(
  df.q.pca$ind$coord[, 1:2],
  FUNcluster = "hclust",
  hc_metric = "euclidean",
  hc_method = "ward.D2",
  k = 4
)
fviz_dend(df.hclust.ward, rect = TRUE)
fviz_cluster(df.hclust.ward)

# k-means partition (object `ec`), drawn for comparison.
fviz_cluster(ec)

# One row per individual: its Ward cluster and the `dist` value stored by
# PCA() for that individual.
# `$cluster` is written in full; `$clust` only worked through partial
# matching, which returns NULL as soon as another element name starts with
# the same prefix.
groups <- df.hclust.ward$cluster
values <- df.q.pca$ind$dist

# Columns are named at construction. The previous `df2.colnames <- ...` line
# only created an unused variable and never renamed anything, so it is
# dropped; the column names stay `groups` and `values`.
df2 <- data.frame(groups = groups, values = values)
head(df2)
## groups values
## 1 1 2.174857
## 2 1 2.211726
## 3 1 1.446202
## 4 2 2.277218
## 5 1 1.095811
## 6 1 1.555551
library(ggplot2)
# Polar plot of the per-individual values by Ward cluster; points in the
# same cluster are joined by a path and coloured by -values.
ggplot(
  df2,
  aes(y = values, x = groups, group = groups, colour = -1 * values)
) +
  coord_polar() +
  geom_point() +
  geom_path() +
  labs(x = NULL)

# PCA plot coloured by Ward cluster number.
# `$cluster` is written in full instead of relying on `$clust` partial
# matching, and there is no trailing comma after the last argument: it would
# pass an empty argument on to fviz_pca().
fviz_pca(
  df.q.pca,
  col.ind = df.hclust.ward$cluster,
  gradient.cols = c(
    "magenta", "pink", "red", "orange", "yellow",
    "green", "blue", "cyan", "gray"
  )
)