# Importar librerias
library(tidyverse)
library(stats)
library(ppcor)
library(ggplot2)
library(GGally)
library(corrplot)
library(tseries)
library(lattice)
library(pacman)
library(MASS)
library(ca)
library(FactoMineR)
library(vegan)
library(gplots)
library(vcd)
library(factoextra)
library(DandEFA)
library(polycor)
library(psych)
library(ade4)
library(GPArotation)
library(reticulate)
# py_install("factor-analyzer")
# py_module_available("factor_analyzer") # Debe devolver TRUE
options(scipen=999) # Eliminar la notación científica
options(digits = 4) # Número de decimales
Análisis Factorial Exploratorio
Descripción de la base de datos
Título: Calidad del Vino
Se crearon dos conjuntos de datos utilizando muestras de vino tinto y blanco. Los dos conjuntos de datos están relacionados con las variantes tinto y blanco del vino portugués “Vinho Verde”. Se utilizará vino blanco para el análisis.
Número de observaciones vino blanco: 4898.
Número de variables: 11 + variable de calidad (ordinal). Valores perdidos: Ninguno.
Variables de entrada (basadas en pruebas fisicoquímicas):
1 - Acidez fija
2 - Acidez volátil
3 - Ácido cítrico
4 - Azúcar residual
5 - Cloruros
6 - Dióxido de azufre libre
7 - Dióxido de azufre total
8 - Densidad
9 - pH
10 - Sulfatos
11 - Alcohol
12 - Calidad (puntuación entre 0 y 10)
Aplicación R
Exploración inicial R
# Pick the CSV file interactively, then read it (header row, ";" separator).
ruta <- file.choose()
wine_white <- read.csv(ruta, header = TRUE, sep = ";")
# estructura
str(wine_white)
'data.frame': 4898 obs. of 12 variables:
$ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
$ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
$ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
$ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
$ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
$ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
$ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
$ density : num 1.001 0.994 0.995 0.996 0.996 ...
$ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
$ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
$ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
$ quality : int 6 6 6 6 6 6 6 6 6 6 ...
head(wine_white)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
1 7.0 0.27 0.36 20.7 0.045
2 6.3 0.30 0.34 1.6 0.049
3 8.1 0.28 0.40 6.9 0.050
4 7.2 0.23 0.32 8.5 0.058
5 7.2 0.23 0.32 8.5 0.058
6 8.1 0.28 0.40 6.9 0.050
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
1 45 170 1.0010 3.00 0.45 8.8
2 14 132 0.9940 3.30 0.49 9.5
3 30 97 0.9951 3.26 0.44 10.1
4 47 186 0.9956 3.19 0.40 9.9
5 47 186 0.9956 3.19 0.40 9.9
6 30 97 0.9951 3.26 0.44 10.1
quality
1 6
2 6
3 6
4 6
5 6
6 6
tail(wine_white)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
4893 6.5 0.23 0.38 1.3 0.032
4894 6.2 0.21 0.29 1.6 0.039
4895 6.6 0.32 0.36 8.0 0.047
4896 6.5 0.24 0.19 1.2 0.041
4897 5.5 0.29 0.30 1.1 0.022
4898 6.0 0.21 0.38 0.8 0.020
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
4893 29 112 0.9930 3.29 0.54 9.7
4894 24 92 0.9911 3.27 0.50 11.2
4895 57 168 0.9949 3.15 0.46 9.6
4896 30 111 0.9925 2.99 0.46 9.4
4897 20 110 0.9887 3.34 0.38 12.8
4898 22 98 0.9894 3.26 0.32 11.8
quality
4893 5
4894 6
4895 5
4896 6
4897 7
4898 6
# Drop the "quality" variable
wine_white$quality <- NULL
head(wine_white)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
1 7.0 0.27 0.36 20.7 0.045
2 6.3 0.30 0.34 1.6 0.049
3 8.1 0.28 0.40 6.9 0.050
4 7.2 0.23 0.32 8.5 0.058
5 7.2 0.23 0.32 8.5 0.058
6 8.1 0.28 0.40 6.9 0.050
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
1 45 170 1.0010 3.00 0.45 8.8
2 14 132 0.9940 3.30 0.49 9.5
3 30 97 0.9951 3.26 0.44 10.1
4 47 186 0.9956 3.19 0.40 9.9
5 47 186 0.9956 3.19 0.40 9.9
6 30 97 0.9951 3.26 0.44 10.1
tail(wine_white)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
4893 6.5 0.23 0.38 1.3 0.032
4894 6.2 0.21 0.29 1.6 0.039
4895 6.6 0.32 0.36 8.0 0.047
4896 6.5 0.24 0.19 1.2 0.041
4897 5.5 0.29 0.30 1.1 0.022
4898 6.0 0.21 0.38 0.8 0.020
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
4893 29 112 0.9930 3.29 0.54 9.7
4894 24 92 0.9911 3.27 0.50 11.2
4895 57 168 0.9949 3.15 0.46 9.6
4896 30 111 0.9925 2.99 0.46 9.4
4897 20 110 0.9887 3.34 0.38 12.8
4898 22 98 0.9894 3.26 0.32 11.8
# resumen
summary(wine_white)
fixed.acidity volatile.acidity citric.acid residual.sugar
Min. : 3.80 Min. :0.080 Min. :0.000 Min. : 0.60
1st Qu.: 6.30 1st Qu.:0.210 1st Qu.:0.270 1st Qu.: 1.70
Median : 6.80 Median :0.260 Median :0.320 Median : 5.20
Mean : 6.86 Mean :0.278 Mean :0.334 Mean : 6.39
3rd Qu.: 7.30 3rd Qu.:0.320 3rd Qu.:0.390 3rd Qu.: 9.90
Max. :14.20 Max. :1.100 Max. :1.660 Max. :65.80
chlorides free.sulfur.dioxide total.sulfur.dioxide density
Min. :0.0090 Min. : 2.0 Min. : 9 Min. :0.987
1st Qu.:0.0360 1st Qu.: 23.0 1st Qu.:108 1st Qu.:0.992
Median :0.0430 Median : 34.0 Median :134 Median :0.994
Mean :0.0458 Mean : 35.3 Mean :138 Mean :0.994
3rd Qu.:0.0500 3rd Qu.: 46.0 3rd Qu.:167 3rd Qu.:0.996
Max. :0.3460 Max. :289.0 Max. :440 Max. :1.039
pH sulphates alcohol
Min. :2.72 Min. :0.22 Min. : 8.0
1st Qu.:3.09 1st Qu.:0.41 1st Qu.: 9.5
Median :3.18 Median :0.47 Median :10.4
Mean :3.19 Mean :0.49 Mean :10.5
3rd Qu.:3.28 3rd Qu.:0.55 3rd Qu.:11.4
Max. :3.82 Max. :1.08 Max. :14.2
describe(wine_white)
vars n mean sd median trimmed mad min max
fixed.acidity 1 4898 6.85 0.84 6.80 6.82 0.74 3.80 14.20
volatile.acidity 2 4898 0.28 0.10 0.26 0.27 0.09 0.08 1.10
citric.acid 3 4898 0.33 0.12 0.32 0.33 0.09 0.00 1.66
residual.sugar 4 4898 6.39 5.07 5.20 5.80 5.34 0.60 65.80
chlorides 5 4898 0.05 0.02 0.04 0.04 0.01 0.01 0.35
free.sulfur.dioxide 6 4898 35.31 17.01 34.00 34.36 16.31 2.00 289.00
total.sulfur.dioxide 7 4898 138.36 42.50 134.00 136.96 43.00 9.00 440.00
density 8 4898 0.99 0.00 0.99 0.99 0.00 0.99 1.04
pH 9 4898 3.19 0.15 3.18 3.18 0.15 2.72 3.82
sulphates 10 4898 0.49 0.11 0.47 0.48 0.10 0.22 1.08
alcohol 11 4898 10.51 1.23 10.40 10.43 1.48 8.00 14.20
range skew kurtosis se
fixed.acidity 10.40 0.65 2.17 0.01
volatile.acidity 1.02 1.58 5.08 0.00
citric.acid 1.66 1.28 6.16 0.00
residual.sugar 65.20 1.08 3.46 0.07
chlorides 0.34 5.02 37.51 0.00
free.sulfur.dioxide 287.00 1.41 11.45 0.24
total.sulfur.dioxide 431.00 0.39 0.57 0.61
density 0.05 0.98 9.78 0.00
pH 1.10 0.46 0.53 0.00
sulphates 0.86 0.98 1.59 0.00
alcohol 6.20 0.49 -0.70 0.02
# Correlation matrix of the variables
cor_ww <- cor(wine_white)
cor_ww
fixed.acidity volatile.acidity citric.acid residual.sugar
fixed.acidity 1.00000 -0.02270 0.28918 0.08902
volatile.acidity -0.02270 1.00000 -0.14947 0.06429
citric.acid 0.28918 -0.14947 1.00000 0.09421
residual.sugar 0.08902 0.06429 0.09421 1.00000
chlorides 0.02309 0.07051 0.11436 0.08868
free.sulfur.dioxide -0.04940 -0.09701 0.09408 0.29910
total.sulfur.dioxide 0.09107 0.08926 0.12113 0.40144
density 0.26533 0.02711 0.14950 0.83897
pH -0.42586 -0.03192 -0.16375 -0.19413
sulphates -0.01714 -0.03573 0.06233 -0.02666
alcohol -0.12088 0.06772 -0.07573 -0.45063
chlorides free.sulfur.dioxide total.sulfur.dioxide
fixed.acidity 0.02309 -0.0493959 0.091070
volatile.acidity 0.07051 -0.0970119 0.089261
citric.acid 0.11436 0.0940772 0.121131
residual.sugar 0.08868 0.2990984 0.401439
chlorides 1.00000 0.1013924 0.198910
free.sulfur.dioxide 0.10139 1.0000000 0.615501
total.sulfur.dioxide 0.19891 0.6155010 1.000000
density 0.25721 0.2942104 0.529881
pH -0.09044 -0.0006178 0.002321
sulphates 0.01676 0.0592172 0.134562
alcohol -0.36019 -0.2501039 -0.448892
density pH sulphates alcohol
fixed.acidity 0.26533 -0.4258583 -0.01714 -0.12088
volatile.acidity 0.02711 -0.0319154 -0.03573 0.06772
citric.acid 0.14950 -0.1637482 0.06233 -0.07573
residual.sugar 0.83897 -0.1941335 -0.02666 -0.45063
chlorides 0.25721 -0.0904395 0.01676 -0.36019
free.sulfur.dioxide 0.29421 -0.0006178 0.05922 -0.25010
total.sulfur.dioxide 0.52988 0.0023210 0.13456 -0.44889
density 1.00000 -0.0935915 0.07449 -0.78014
pH -0.09359 1.0000000 0.15595 0.12143
sulphates 0.07449 0.1559515 1.00000 -0.01743
alcohol -0.78014 0.1214321 -0.01743 1.00000
# Correlation heat map: upper triangle only, coloured cells with the
# coefficient printed in each one.
corrplot(
  cor_ww,
  title = "Mapa de calor - Matriz de correlaciones",
  mar = c(1, 1, 2, 1),
  method = "color",
  type = "upper",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.cex = 0.8,
  tl.srt = 45
)
# Histograms: one per input variable. "quality" is excluded explicitly so the
# loop draws the same eleven plots whether or not it was already dropped.
for (variable in setdiff(names(wine_white), "quality")) {
  # Column names use dots (e.g. fixed.acidity); show them with spaces.
  etiqueta <- gsub(".", " ", variable, fixed = TRUE)
  hist(
    wine_white[[variable]],
    col = "skyblue4",
    main = paste0("Histograma de '", etiqueta, "'"),
    xlab = etiqueta,
    ylab = "Frecuencia"
  )
}
Datos atípicos
Detección de datos atípicos
# Mahalanobis distance of every observation to the vector of column means,
# using the sample covariance matrix.
mahalanobis_dist <- mahalanobis(wine_white, colMeans(wine_white), cov(wine_white))

# Critical value used to flag outliers (chi-squared quantile)
gl <- ncol(wine_white) # degrees of freedom (number of variables)
alpha <- 0.05 # significance level
valor_critico <- qchisq(1 - alpha, df = gl)

# show the critical value
valor_critico
[1] 19.68
# Row positions whose distance exceeds the critical value (outliers)
atipicos <- which(mahalanobis_dist > valor_critico)
head(mahalanobis_dist) #ver distancias
[1] 9.264 6.682 8.294 3.224 3.224 8.294
length(atipicos)
[1] 395
Limpieza de datos atípicos
# New data frame without the outliers.
# Rows to keep are selected positively: wine_white[-atipicos, ] would return
# zero rows if `atipicos` were empty, because -integer(0) selects nothing.
filas_validas <- setdiff(seq_len(nrow(wine_white)), atipicos)
wine_w <- wine_white[filas_validas, ]
# comprobar:
nrow(wine_white)
[1] 4898
nrow(wine_w)
[1] 4503
nrow(wine_white) - nrow(wine_w) # filas eliminadas = 395
[1] 395
Estandarizar los datos
# Centre and scale every column of the cleaned data
wine_scale <- scale(wine_w)
head(wine_scale)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
1 0.1967 -0.00192 0.3026 2.9009 0.1460
2 -0.6903 0.35094 0.1093 -0.9757 0.4777
3 1.5906 0.11570 0.6891 0.1000 0.5606
4 0.4502 -0.47240 -0.0840 0.4248 1.2241
5 0.4502 -0.47240 -0.0840 0.4248 1.2241
6 1.5906 0.11570 0.6891 0.1000 0.5606
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
1 0.6520 0.7906 2.429378 -1.304412 -0.33888 -1.4223
2 -1.3633 -0.1403 0.009714 0.770269 0.04105 -0.8475
3 -0.3231 -0.9977 0.389947 0.493645 -0.43386 -0.3549
4 0.7820 1.1825 0.562780 0.009553 -0.81379 -0.5191
5 0.7820 1.1825 0.562780 0.009553 -0.81379 -0.5191
6 -0.3231 -0.9977 0.389947 0.493645 -0.43386 -0.3549
tail(wine_scale)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
4893 -0.4368 -0.4724 0.4958 -1.0366 -0.9321
4894 -0.8170 -0.7076 -0.3739 -0.9757 -0.3516
4895 -0.3101 0.5862 0.3026 0.3233 0.3118
4896 -0.4368 -0.3548 -1.3403 -1.0569 -0.1858
4897 -1.7040 0.2333 -0.2773 -1.0772 -1.7615
4898 -1.0704 -0.7076 0.4958 -1.1380 -1.9273
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
4893 -0.3881 -0.6303 -0.3429 0.7011 0.5160 -0.6833
4894 -0.7132 -1.1202 -0.9789 0.5628 0.1360 0.5483
4895 1.4321 0.7416 0.3208 -0.2671 -0.2439 -0.7654
4896 -0.3231 -0.6547 -0.4950 -1.3736 -0.2439 -0.9297
4897 -0.9732 -0.6792 -1.8258 1.0469 -1.0038 1.8620
4898 -0.8432 -0.9732 -1.5769 0.4936 -1.5737 1.0409
describe(wine_scale)
vars n mean sd median trimmed mad min max range
fixed.acidity 1 4503 0 1 -0.06 -0.03 0.94 -3.10 4.00 7.10
volatile.acidity 2 4503 0 1 -0.12 -0.08 0.87 -2.24 4.23 6.47
citric.acid 3 4503 0 1 -0.18 -0.06 0.72 -3.18 3.97 7.15
residual.sugar 4 4503 0 1 -0.22 -0.11 1.11 -1.18 3.47 4.65
chlorides 5 4503 0 1 -0.02 -0.06 0.86 -2.59 6.86 9.45
free.sulfur.dioxide 6 4503 0 1 -0.06 -0.04 1.06 -2.14 3.97 6.11
total.sulfur.dioxide 7 4503 0 1 -0.09 -0.03 1.05 -2.91 3.29 6.20
density 8 4503 0 1 -0.09 -0.03 1.11 -2.37 2.91 5.29
pH 9 4503 0 1 -0.06 -0.03 0.92 -2.76 4.02 6.78
sulphates 10 4503 0 1 -0.15 -0.07 0.99 -2.52 3.94 6.46
alcohol 11 4503 0 1 -0.11 -0.06 1.10 -1.75 3.01 4.76
skew kurtosis se
fixed.acidity 0.35 0.35 0.01
volatile.acidity 0.79 0.93 0.01
citric.acid 0.69 1.77 0.01
residual.sugar 0.71 -0.53 0.01
chlorides 1.41 6.07 0.01
free.sulfur.dioxide 0.37 -0.22 0.01
total.sulfur.dioxide 0.28 -0.29 0.01
density 0.26 -0.80 0.01
pH 0.36 0.21 0.01
sulphates 0.67 0.42 0.01
alcohol 0.44 -0.75 0.01
Prueba de supuestos
Normalidad
# Ho: Los datos siguen una distribución normal
# Ha: Los datos no siguen una distribución normal
jarque.bera.test(wine_w$fixed.acidity)
Jarque Bera Test
data: wine_w$fixed.acidity
X-squared = 114, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$volatile.acidity)
Jarque Bera Test
data: wine_w$volatile.acidity
X-squared = 636, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$citric.acid)
Jarque Bera Test
data: wine_w$citric.acid
X-squared = 945, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$residual.sugar)
Jarque Bera Test
data: wine_w$residual.sugar
X-squared = 434, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$chlorides)
Jarque Bera Test
data: wine_w$chlorides
X-squared = 8412, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$free.sulfur.dioxide)
Jarque Bera Test
data: wine_w$free.sulfur.dioxide
X-squared = 114, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$total.sulfur.dioxide)
Jarque Bera Test
data: wine_w$total.sulfur.dioxide
X-squared = 77, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$density)
Jarque Bera Test
data: wine_w$density
X-squared = 169, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$pH)
Jarque Bera Test
data: wine_w$pH
X-squared = 103, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$sulphates)
Jarque Bera Test
data: wine_w$sulphates
X-squared = 372, df = 2, p-value <0.0000000000000002
jarque.bera.test(wine_w$alcohol)
Jarque Bera Test
data: wine_w$alcohol
X-squared = 253, df = 2, p-value <0.0000000000000002
Prueba de Bartlett
# H0: the correlation matrix is the identity matrix
# Ha: the correlation matrix is not the identity matrix
# Bartlett's sphericity test on the correlation matrix (psych).
# stats::bartlett.test() is a test of homogeneity of variances across the
# columns, which is not the hypothesis stated above.
# NOTE(review): the output printed right after this chunk was produced by
# bartlett.test(); re-render the document to refresh it.
cortest.bartlett(cor(wine_w), n = nrow(wine_w))
Bartlett test of homogeneity of variances
data: wine_w
Bartlett's K-squared = 356558, df = 10, p-value <0.0000000000000002
Prueba KMO
KMO(wine_w)
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = wine_w)
Overall MSA = 0.37
MSA for each item =
fixed.acidity volatile.acidity citric.acid
0.13 0.29 0.72
residual.sugar chlorides free.sulfur.dioxide
0.32 0.82 0.61
total.sulfur.dioxide density pH
0.68 0.40 0.12
sulphates alcohol
0.19 0.36
Análisis Factorial Exploratorio (EFA)
# Principal-axis factor model, 5 factors, no rotation.
# psych::fa() names principal axis factoring fm = "pa". The loadings printed
# further down are labelled MR1..MR5 (minimum residual), which indicates the
# previous value "paf" was not recognised and minres was fitted instead.
# NOTE(review): printed results below come from that minres fit; re-render.
mAF <- fa(wine_scale, nfactors = 5, rotate = "none", fm = "pa")
Comunalidades
mAF$communalities
fixed.acidity volatile.acidity citric.acid
0.68190 0.27186 0.22673
residual.sugar chlorides free.sulfur.dioxide
0.99500 0.33062 0.46616
total.sulfur.dioxide density pH
0.99500 0.97629 0.31232
sulphates alcohol
0.07167 0.81653
# sorted in decreasing order (TRUE spelled out: T can be reassigned)
sort(mAF$communalities, decreasing = TRUE)
residual.sugar total.sulfur.dioxide density
0.99500 0.99500 0.97629
alcohol fixed.acidity free.sulfur.dioxide
0.81653 0.68190 0.46616
chlorides pH volatile.acidity
0.33062 0.31232 0.27186
citric.acid sulphates
0.22673 0.07167
Unicidades
mAF$uniquenesses
fixed.acidity volatile.acidity citric.acid
0.318134 0.728104 0.773289
residual.sugar chlorides free.sulfur.dioxide
-0.005115 0.669384 0.533832
total.sulfur.dioxide density pH
-0.006996 0.023716 0.687666
sulphates alcohol
0.928363 0.183483
# sorted in decreasing order (TRUE spelled out: T can be reassigned)
sort(mAF$uniquenesses, decreasing = TRUE)
sulphates citric.acid volatile.acidity
0.928363 0.773289 0.728104
pH chlorides free.sulfur.dioxide
0.687666 0.669384 0.533832
fixed.acidity alcohol density
0.318134 0.183483 0.023716
residual.sugar total.sulfur.dioxide
-0.005115 -0.006996
Determinar número de factores
# Eigen-decomposition of the correlation matrix of the standardised data
eigen_var_vect <- eigen(cor(wine_scale))

# One criterion for choosing the number of factors is Kaiser's rule:
# keep eigenvalues greater than 1, since this is a correlation matrix
eigen_var_vect$values
[1] 3.46163 1.57425 1.23190 1.05864 0.95322 0.75338 0.72039 0.57133 0.38898
[10] 0.27420 0.01208
# Gráfico de sedimentación
plot(eigen_var_vect$values,type="b",pch=20,col="dodgerblue4", main="Gráfico de sedimentación", xlab="Factores", ylab="Eigenvalores")
abline(h=1,lty=3,col="firebrick3")
# Scree plots for factors (FA) and principal components (PC)
# 1
scree(cor(wine_scale), main = "Gráfico de sedimentación FA y PC")
# 2
# The input is a correlation matrix, so the number of observations behind it
# is passed explicitly; otherwise fa.parallel() has to assume a sample size
# for the simulated data.
# NOTE(review): the suggestion printed after this chunk was obtained without
# n.obs and may change on re-render.
fa.parallel(
  cor(wine_scale),
  n.obs = nrow(wine_scale),
  main = "Gráfico de sedimentación FA y PC",
  ylab = "Eigenvalores"
)
Parallel analysis suggests that the number of factors = 5 and the number of components = 2
Cargas factoriales
mAF$loadings
Loadings:
MR1 MR2 MR3 MR4 MR5
fixed.acidity 0.217 -0.732 0.296
volatile.acidity 0.241 -0.454
citric.acid 0.155 -0.280 0.249 0.247
residual.sugar 0.805 -0.309 0.489 0.138
chlorides 0.459 -0.106 -0.278 -0.175
free.sulfur.dioxide 0.491 0.257 0.334 0.129 0.175
total.sulfur.dioxide 0.754 0.350 0.531 -0.174
density 0.954 -0.226
pH -0.155 0.494 -0.150 0.130
sulphates 0.144 -0.153 0.106
alcohol -0.788 0.220 0.379
MR1 MR2 MR3 MR4 MR5
SS loadings 3.303 1.086 0.765 0.592 0.419
Proportion Var 0.300 0.099 0.070 0.054 0.038
Cumulative Var 0.300 0.399 0.469 0.522 0.561
# Element-wise square of the loadings
contribucion <- mAF$loadings * mAF$loadings
contribucion
Loadings:
MR1 MR2 MR3 MR4 MR5
fixed.acidity 0.535
volatile.acidity 0.206
citric.acid
residual.sugar 0.648 0.239
chlorides 0.211
free.sulfur.dioxide 0.241 0.112
total.sulfur.dioxide 0.569 0.123 0.282
density 0.911
pH 0.244
sulphates
alcohol 0.621 0.144
MR1 MR2 MR3 MR4 MR5
SS loadings 2.065 0.372 0.118 0.088 0.050
Proportion Var 0.188 0.034 0.011 0.008 0.005
Cumulative Var 0.188 0.222 0.232 0.240 0.245
# Plot the squared loadings as a white-to-red heat map (not a correlation
# matrix, hence is.corr = FALSE).
contribucion <- as.matrix(contribucion)
corrplot(
  contribucion,
  is.corr = FALSE,
  col = colorRampPalette(c("white", "firebrick3"))(200)
)
# Biplots of the unrotated factor solution (4 factors).
# fm = "pa" is psych's name for principal axis factoring; "paf" is not one of
# the methods documented for psych::fa().
fa_no_rota <- fa(wine_scale, nfactors = 4, rotate = "none", fm = "pa")

# One biplot per pair of factors: (1,2) (1,3) (1,4) (2,3) (2,4) (3,4),
# each pair with its own point colour.
pares <- combn(4, 2)
colores <- c("steelblue", "firebrick", "lightgreen", "gold2", "#808080", "purple4")
for (k in seq_len(ncol(pares))) {
  i <- pares[1, k]
  j <- pares[2, k]
  biplot.psych(
    fa_no_rota,
    choose = c(i, j),
    main = sprintf("Biplot: Factor %d y Factor %d (Sin rotación)", i, j),
    col = c(colores[k], "black")
  )
}
# Correlation circles built from the loadings of mAF
cargas <- mAF$loadings

# One circle per pair among the first four factors:
# (1,2) (1,3) (1,4) (2,3) (2,4) (3,4); first factor on x, second on y.
pares <- combn(4, 2)
for (k in seq_len(ncol(pares))) {
  i <- pares[1, k]
  j <- pares[2, k]
  s.corcircle(
    cargas[, c(i, j)],
    grid = TRUE,
    sub = sprintf("Factor %d (x) y Factor %d (y)", i, j),
    clabel = 1.2
  )
}
Rotación de los factores
Rotación ortogonal (quartimax)
# Biplots of the factor solution with orthogonal "quartimax" rotation.
# fm = "pa" is psych's name for principal axis factoring; "paf" is not one of
# the methods documented for psych::fa().
fa_quartimax <- fa(wine_scale, nfactors = 4, rotate = "quartimax", fm = "pa")

# One biplot per pair of factors: (1,2) (1,3) (1,4) (2,3) (2,4) (3,4),
# each pair with its own point colour.
pares <- combn(4, 2)
colores <- c("steelblue", "firebrick", "lightgreen", "gold2", "#808080", "purple4")
for (k in seq_len(ncol(pares))) {
  i <- pares[1, k]
  j <- pares[2, k]
  biplot.psych(
    fa_quartimax,
    choose = c(i, j),
    main = sprintf("Biplot: Factor %d y Factor %d (Rotación: quartimax)", i, j),
    col = c(colores[k], "black")
  )
}
Rotación oblicua (oblimin)
# Biplots of the factor solution with oblique "oblimin" rotation.
# fm = "pa" is psych's name for principal axis factoring; "paf" is not one of
# the methods documented for psych::fa().
fa_oblimin <- fa(wine_scale, nfactors = 4, rotate = "oblimin", fm = "pa")

# One biplot per pair of factors: (1,2) (1,3) (1,4) (2,3) (2,4) (3,4),
# each pair with its own point colour.
pares <- combn(4, 2)
colores <- c("steelblue", "firebrick", "lightgreen", "gold2", "#808080", "purple4")
for (k in seq_len(ncol(pares))) {
  i <- pares[1, k]
  j <- pares[2, k]
  biplot.psych(
    fa_oblimin,
    choose = c(i, j),
    main = sprintf("Biplot: Factor %d y Factor %d (Rotación: oblimin)", i, j),
    col = c(colores[k], "black")
  )
}
# Correlation between the factors
# NOTE(review): r.scores holds the correlations of the factor score
# estimates; psych also stores the factor intercorrelations of an oblique
# solution in fa_oblimin$Phi — confirm which one is intended here.
fa_oblimin$r.scores
[,1] [,2] [,3] [,4]
[1,] 1.0000 0.53911 0.3767 0.16141
[2,] 0.5391 1.00000 0.4427 0.04545
[3,] 0.3767 0.44270 1.0000 0.18534
[4,] 0.1614 0.04545 0.1853 1.00000
Gráficos de factores
# Factores sin rotar
fa.diagram(fa_no_rota, main = "Factores sin rotación")
# Factores rotación ortogonal quartimax
fa.diagram(fa_quartimax, main = "Factores rotación ortogonal quartimax")
# Factores rotación oblicua oblimin
fa.diagram(fa_oblimin, main = "Factores rotación oblicua oblimin")
Aplicación Python
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import factor_analyzer
from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from scipy.spatial.distance import pdist, squareform
import pingouin as pg
from scipy.spatial.distance import mahalanobis
from scipy.stats import chi2
from scipy.stats import jarque_bera
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
import seaborn as sns
Exploración inicial
# Read the white-wine CSV (semicolon-separated).
wine_white = pd.read_csv(r"C:\Users\MINEDUCYT\Documents\Seminario\wine+quality\winequality-white.csv", sep=";")

pd.set_option('display.max_rows', None)  # show every row
pd.set_option('display.max_columns', None)  # show every column
# información de las variables
wine_white.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
# Datos descriptivos
wine_white.describe()
fixed acidity volatile acidity citric acid residual sugar \
count 4898.000000 4898.000000 4898.000000 4898.000000
mean 6.854788 0.278241 0.334192 6.391415
std 0.843868 0.100795 0.121020 5.072058
min 3.800000 0.080000 0.000000 0.600000
25% 6.300000 0.210000 0.270000 1.700000
50% 6.800000 0.260000 0.320000 5.200000
75% 7.300000 0.320000 0.390000 9.900000
max 14.200000 1.100000 1.660000 65.800000
chlorides free sulfur dioxide total sulfur dioxide density \
count 4898.000000 4898.000000 4898.000000 4898.000000
mean 0.045772 35.308085 138.360657 0.994027
std 0.021848 17.007137 42.498065 0.002991
min 0.009000 2.000000 9.000000 0.987110
25% 0.036000 23.000000 108.000000 0.991723
50% 0.043000 34.000000 134.000000 0.993740
75% 0.050000 46.000000 167.000000 0.996100
max 0.346000 289.000000 440.000000 1.038980
pH sulphates alcohol quality
count 4898.000000 4898.000000 4898.000000 4898.000000
mean 3.188267 0.489847 10.514267 5.877909
std 0.151001 0.114126 1.230621 0.885639
min 2.720000 0.220000 8.000000 3.000000
25% 3.090000 0.410000 9.500000 5.000000
50% 3.180000 0.470000 10.400000 6.000000
75% 3.280000 0.550000 11.400000 6.000000
max 3.820000 1.080000 14.200000 9.000000
wine_white.head()
fixed acidity volatile acidity citric acid residual sugar chlorides \
0 7.0 0.27 0.36 20.7 0.045
1 6.3 0.30 0.34 1.6 0.049
2 8.1 0.28 0.40 6.9 0.050
3 7.2 0.23 0.32 8.5 0.058
4 7.2 0.23 0.32 8.5 0.058
free sulfur dioxide total sulfur dioxide density pH sulphates \
0 45.0 170.0 1.0010 3.00 0.45
1 14.0 132.0 0.9940 3.30 0.49
2 30.0 97.0 0.9951 3.26 0.44
3 47.0 186.0 0.9956 3.19 0.40
4 47.0 186.0 0.9956 3.19 0.40
alcohol quality
0 8.8 6
1 9.5 6
2 10.1 6
3 9.9 6
4 9.9 6
wine_white.tail()
fixed acidity volatile acidity citric acid residual sugar chlorides \
4893 6.2 0.21 0.29 1.6 0.039
4894 6.6 0.32 0.36 8.0 0.047
4895 6.5 0.24 0.19 1.2 0.041
4896 5.5 0.29 0.30 1.1 0.022
4897 6.0 0.21 0.38 0.8 0.020
free sulfur dioxide total sulfur dioxide density pH sulphates \
4893 24.0 92.0 0.99114 3.27 0.50
4894 57.0 168.0 0.99490 3.15 0.46
4895 30.0 111.0 0.99254 2.99 0.46
4896 20.0 110.0 0.98869 3.34 0.38
4897 22.0 98.0 0.98941 3.26 0.32
alcohol quality
4893 11.2 6
4894 9.6 5
4895 9.4 6
4896 12.8 7
4897 11.8 6
# Drop the "quality" variable
wine_white = wine_white.drop("quality", axis=1)  # axis=0 rows, axis=1 columns
wine_white.head()
fixed acidity volatile acidity citric acid residual sugar chlorides \
0 7.0 0.27 0.36 20.7 0.045
1 6.3 0.30 0.34 1.6 0.049
2 8.1 0.28 0.40 6.9 0.050
3 7.2 0.23 0.32 8.5 0.058
4 7.2 0.23 0.32 8.5 0.058
free sulfur dioxide total sulfur dioxide density pH sulphates \
0 45.0 170.0 1.0010 3.00 0.45
1 14.0 132.0 0.9940 3.30 0.49
2 30.0 97.0 0.9951 3.26 0.44
3 47.0 186.0 0.9956 3.19 0.40
4 47.0 186.0 0.9956 3.19 0.40
alcohol
0 8.8
1 9.5
2 10.1
3 9.9
4 9.9
wine_white.tail()
fixed acidity volatile acidity citric acid residual sugar chlorides \
4893 6.2 0.21 0.29 1.6 0.039
4894 6.6 0.32 0.36 8.0 0.047
4895 6.5 0.24 0.19 1.2 0.041
4896 5.5 0.29 0.30 1.1 0.022
4897 6.0 0.21 0.38 0.8 0.020
free sulfur dioxide total sulfur dioxide density pH sulphates \
4893 24.0 92.0 0.99114 3.27 0.50
4894 57.0 168.0 0.99490 3.15 0.46
4895 30.0 111.0 0.99254 2.99 0.46
4896 20.0 110.0 0.98869 3.34 0.38
4897 22.0 98.0 0.98941 3.26 0.32
alcohol
4893 11.2
4894 9.6
4895 9.4
4896 12.8
4897 11.8
# Correlations
# Clear the current figure before plotting
plt.clf()
plt.figure(figsize=(10, 8))
sns.heatmap(wine_white.corr(), annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.title("Mapa de calor correlaciones")
plt.show()
# Histograms of the variables
for columna in wine_white.columns:
    plt.clf()
    sns.histplot(wine_white[columna], kde=True, bins=15)  # kde draws the density curve
    plt.title(f'Histograma de {columna}')
    plt.ylabel("Frecuencia")
    plt.show()
Datos atípicos
Detección de datos atípicos
# Mahalanobis quadratic form (x - mean)' S^-1 (x - mean) for every row.
xi_xbar = (wine_white - wine_white.mean()).to_numpy()
inv_cov = np.linalg.inv(wine_white.cov())
parte1 = xi_xbar @ inv_cov

# Row-wise dot product of parte1 with the centred data. This equals the
# diagonal of parte1 @ xi_xbar.T without building the full n x n matrix.
mahalanobis_dist = np.einsum("ij,ij->i", parte1, xi_xbar)
print(mahalanobis_dist)
[9.26390697 6.68246735 8.2939681 ... 9.55520151 7.89464608 7.23836837]
# Critical value used to flag outliers (chi-squared quantile)
gl = wine_white.shape[1]  # degrees of freedom (number of variables)
alpha = 0.05  # significance level
valor_critico = chi2.ppf(1 - alpha, df=gl)
print("Valor crítico: ",valor_critico)
Valor crítico: 19.67513757268249
# Identify outliers: positions whose distance exceeds the critical value
atipicos = np.where(mahalanobis_dist > valor_critico)[0]
# Mostrar los outliers
print("Número de datos atípicos: \n", len(atipicos))
Número de datos atípicos:
395
Limpieza de datos atípicos
# Remove outliers.
# `atipicos` holds row positions (from np.where), while drop(index=...)
# expects labels; translate positions to labels so this also works when the
# index is not the default 0..n-1 range.
wine_w = wine_white.drop(index=wine_white.index[atipicos])
print(f"Base original: {wine_white.shape[0]} filas")
Base original: 4898 filas
# Closing parenthesis added inside the message, which was left unbalanced.
print(f"Base limpia: {wine_w.shape[0]} filas (se eliminaron {len(atipicos)} observaciones atípicas)")
Base limpia: 4503 filas (se eliminaron 395 observaciones atípicas
Estandarizar los datos
# scaler
scaler = StandardScaler()

# scale the cleaned data (returns a NumPy array)
wine_scale = scaler.fit_transform(wine_w)

# DataFrame of the scaled data, with the column names of the data it came from
wine_scale = pd.DataFrame(wine_scale, columns=wine_w.columns)
# Base escalada/estandarizada
print(wine_scale.describe())
fixed acidity volatile acidity citric acid residual sugar \
count 4.503000e+03 4.503000e+03 4.503000e+03 4.503000e+03
mean 1.767284e-16 -3.818595e-16 -4.670678e-16 -1.767284e-16
std 1.000111e+00 1.000111e+00 1.000111e+00 1.000111e+00
min -3.098146e+00 -2.236944e+00 -3.176817e+00 -1.178772e+00
25% -6.903295e-01 -7.077169e-01 -5.672597e-01 -9.351902e-01
50% -5.669373e-02 -1.195529e-01 -1.806586e-01 -2.247437e-01
75% 5.769421e-01 5.862440e-01 4.958935e-01 7.089860e-01
max 3.998575e+00 4.232861e+00 3.975304e+00 3.469578e+00
chlorides free sulfur dioxide total sulfur dioxide density \
count 4.503000e+03 4.503000e+03 4.503000e+03 4.503000e+03
mean 1.136111e-16 2.288001e-16 -2.145987e-16 6.311728e-15
std 1.000111e+00 1.000111e+00 1.000111e+00 1.000111e+00
min -2.591053e+00 -2.143602e+00 -2.908771e+00 -2.372190e+00
25% -6.834314e-01 -7.132585e-01 -7.283190e-01 -7.888627e-01
50% -1.991078e-02 -6.310253e-02 -9.133299e-02 -9.399628e-02
75% 5.606698e-01 6.520691e-01 6.926498e-01 7.356950e-01
max 6.864116e+00 3.967865e+00 3.289593e+00 2.913634e+00
pH sulphates alcohol
count 4.503000e+03 4.503000e+03 4.503000e+03
mean -1.375957e-15 -4.891589e-16 -1.375957e-15
std 1.000111e+00 1.000111e+00 1.000111e+00
min -2.756994e+00 -2.523759e+00 -1.750924e+00
25% -6.820833e-01 -7.188921e-01 -8.476427e-01
50% -5.961010e-02 -1.489342e-01 -1.085945e-01
75% 6.320268e-01 5.160168e-01 7.125703e-01
max 4.021048e+00 3.935764e+00 3.011832e+00
Prueba de supuestos
Normalidad
# Null hypothesis (was labelled "Ha", duplicating the alternative printed next)
print("Ho: Los datos siguen una distribución normal")
Ha: Los datos siguen una distribución normal
print("Ha: Los datos no siguen una distribución normal")
Ha: Los datos no siguen una distribución normal
# Jarque-Bera test for every column of the cleaned data
for columna in wine_w.columns:
    jb_stat, p_value = jarque_bera(wine_w[columna])
    print(f"\nVariable: {columna}")
    print(f"Estadístico JB: {jb_stat:.4f}")
    print(f"Valor p: {p_value:.4f}")
Variable: fixed acidity
Estadístico JB: 113.9378
Valor p: 0.0000
Variable: volatile acidity
Estadístico JB: 635.5124
Valor p: 0.0000
Variable: citric acid
Estadístico JB: 944.7620
Valor p: 0.0000
Variable: residual sugar
Estadístico JB: 434.1879
Valor p: 0.0000
Variable: chlorides
Estadístico JB: 8411.9283
Valor p: 0.0000
Variable: free sulfur dioxide
Estadístico JB: 113.9814
Valor p: 0.0000
Variable: total sulfur dioxide
Estadístico JB: 77.0026
Valor p: 0.0000
Variable: density
Estadístico JB: 169.2974
Valor p: 0.0000
Variable: pH
Estadístico JB: 102.9806
Valor p: 0.0000
Variable: sulphates
Estadístico JB: 371.7906
Valor p: 0.0000
Variable: alcohol
Estadístico JB: 253.3940
Valor p: 0.0000
Prueba de Bartlett
# Bartlett's test of sphericity.
# The null hypothesis (H0) is that the correlation matrix is the identity.
print("H0: La matriz de correlaciones es la matriz identidad")
print("Ha: La matriz de correlaciones no es la matriz identidad")

# calculate_bartlett_sphericity returns a pair (chi-square statistic, p-value).
# Unpack it so that only the p-value is printed under the "p-valor" label.
chi_cuadrado, p_value = calculate_bartlett_sphericity(wine_w)
print(f"Chi-cuadrado: {chi_cuadrado}")
print(f"p-valor: {p_value}")
# Values from the rendered run: chi-square = 26590.217856052033, p-value = 0.0
Prueba KMO
# Kaiser-Meyer-Olkin measure: calculate_kmo returns two results, unpacked here
# as kmo_all (printed below as one value per column) and kmo_model.
kmo_all, kmo_model = calculate_kmo(wine_w)
print(kmo_all)
[0.12953995 0.28670479 0.72465744 0.31699293 0.82259387 0.60730888
0.68125771 0.40296197 0.12415828 0.18852687 0.36168296]
print(kmo_model)
0.36653747127663994
Análisis Factorial Exploratorio (EFA)
# Build the model: 4 factors, principal method, no rotation
fa_sin_rotacion = FactorAnalyzer(n_factors=4, method='principal', rotation=None)

# Fit it to the data
fa_sin_rotacion.fit(wine_scale)
FactorAnalyzer(method='principal', n_factors=4, rotation=None, rotation_kwargs={})
# Explained variance.
# Element [1] of the result is scaled by 100 and printed as a percentage per factor
# (per the factor_analyzer docs this is the proportional variance - confirm).
varianza = fa_sin_rotacion.get_factor_variance()
print("Varianza por factor:", varianza[1]*100,"%")
Varianza por factor: [31.46936965 14.31140846 11.19905582 9.62399505] %
Comunalidades
# Communalities of the unrotated model (one value per variable)
comunalidades = fa_sin_rotacion.get_communalities()
print("Comunalidades:", comunalidades)
Comunalidades: [0.6414543 0.56938736 0.60338116 0.65712336 0.57522096 0.76498
0.75300714 0.90217532 0.6529997 0.41204653 0.79464536]
Unicidades
# Uniqueness = 1 - communality, computed element-wise
unicidades = 1-fa_sin_rotacion.get_communalities()
print("unicidades:", unicidades)
unicidades: [0.3585457 0.43061264 0.39661884 0.34287664 0.42477904 0.23502
0.24699286 0.09782468 0.3470003 0.58795347 0.20535464]
Determinar número de factores
# 1. Eigenvalues
# get_eigenvalues returns two arrays; only the first is kept here.
autovalores, _ = fa_sin_rotacion.get_eigenvalues()
print("Autovalores:", autovalores)
Autovalores: [3.46163066 1.57425493 1.23189614 1.05863946 0.95322006 0.75337666
0.72039041 0.57133279 0.38897996 0.27420026 0.01207867]
# 2. Scree plot: eigenvalues against their 1-based position
plt.clf()
plt.plot(range(1, len(autovalores)+1), autovalores, 'o-')
# Dashed reference line at eigenvalue = 1
plt.axhline(y=1, color='r', linestyle='--')
plt.xlabel("Número de Factores")
plt.ylabel("Autovalores")
plt.title("Gráfico de sedimentación")
plt.show()
Cargas factoriales
# Loading matrix of the unrotated model (rows: variables, columns: factors)
cargas = fa_sin_rotacion.loadings_
print(cargas)
[[ 0.24254434 -0.75475996 0.1099159 -0.0297059 ]
[-0.00883745 0.10944178 -0.61543426 0.42257831]
[ 0.20178308 -0.4783024 0.56144353 0.13664821]
[ 0.77565978 -0.02454286 -0.20811914 0.10751435]
[ 0.55710835 0.09994367 -0.17144426 -0.47483615]
[ 0.56614519 0.24582749 0.29703696 0.5438727 ]
[ 0.73866287 0.24000483 0.12472533 0.36636808]
[ 0.93358465 -0.00943635 -0.08982329 -0.14979238]
[-0.19354225 0.73505749 0.23531004 -0.14092825]
[ 0.11065673 0.30292328 0.52614027 -0.17667913]
[-0.81398007 -0.05528124 0.09528781 0.34633225]]
# Same loadings as a DataFrame, with the variable names as row labels
cargas2 = pd.DataFrame(cargas, index=wine_w.columns)
cargas2
0 1 2 3
fixed acidity 0.242544 -0.754760 0.109916 -0.029706
volatile acidity -0.008837 0.109442 -0.615434 0.422578
citric acid 0.201783 -0.478302 0.561444 0.136648
residual sugar 0.775660 -0.024543 -0.208119 0.107514
chlorides 0.557108 0.099944 -0.171444 -0.474836
free sulfur dioxide 0.566145 0.245827 0.297037 0.543873
total sulfur dioxide 0.738663 0.240005 0.124725 0.366368
density 0.933585 -0.009436 -0.089823 -0.149792
pH -0.193542 0.735057 0.235310 -0.140928
sulphates 0.110657 0.302923 0.526140 -0.176679
alcohol -0.813980 -0.055281 0.095288 0.346332
# Contribution: element-wise square of each loading
contribucion = cargas2*cargas2
print(contribucion)
0 1 2 3
fixed acidity 0.058828 0.569663 0.012082 0.000882
volatile acidity 0.000078 0.011978 0.378759 0.178572
citric acid 0.040716 0.228773 0.315219 0.018673
residual sugar 0.601648 0.000602 0.043314 0.011559
chlorides 0.310370 0.009989 0.029393 0.225469
free sulfur dioxide 0.320520 0.060431 0.088231 0.295798
total sulfur dioxide 0.545623 0.057602 0.015556 0.134226
density 0.871580 0.000089 0.008068 0.022438
pH 0.037459 0.540310 0.055371 0.019861
sulphates 0.012245 0.091763 0.276824 0.031216
alcohol 0.662564 0.003056 0.009080 0.119946
# Heatmap of the squared loadings.
# .values turns the DataFrame into a plain array, so the row labels are lost here.
contribucion = contribucion.values

plt.clf()
plt.figure(figsize=(10, 8))
# Put the variable names on the row ticks. Passing wine_w.columns to plt.ylabel
# would print the repr of the whole Index as a single axis title instead.
sns.heatmap(contribucion, annot=True, cmap="coolwarm", center=0, linewidths=0.5,
            yticklabels=list(wine_w.columns))
plt.ylabel("Variables")
plt.title("Mapa de calor: contribución a los factores")
plt.show()
# Gráficos de cargas y scores
#####################################################################
def biplot(fa, X, choose=(0, 1), title="Biplot", point_color=None, vector_color='black'):
    """Draw a biplot of two factors: observations as points, variables as arrows.

    Parameters
    ----------
    fa : fitted model exposing ``loadings_`` and ``transform``.
    X : data passed to ``fa.transform``; must expose ``columns``, whose entries
        are used as arrow labels.
    choose : pair of 0-based factor indices; the first goes on the x axis and
        the second on the y axis.
    title : figure title.
    point_color : color of the observation points (passed to ``scatter``).
    vector_color : color of the arrows and their labels.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Clear whatever figure is current before a new one is created below
    plt.clf()

    # Loadings and factor scores restricted to the two chosen factors
    loadings = fa.loadings_[:, choose]
    scores = fa.transform(X)[:, choose]

    # Standardize the scores column-wise for a more readable plot
    scores = (scores - scores.mean(axis=0)) / scores.std(axis=0)

    # Create the figure
    fig, ax = plt.subplots(figsize=(10, 8))

    # Observations (points)
    ax.scatter(scores[:, 0], scores[:, 1], alpha=0.7, color=point_color)

    # Variables (arrows from the origin, labelled slightly beyond the tip)
    for i, (x, y) in enumerate(loadings):
        ax.arrow(0, 0, x, y, color=vector_color, head_width=0.05, length_includes_head=True)
        ax.text(x*1.15, y*1.15, X.columns[i], color=vector_color, ha='center', va='center')

    # Reference lines through the origin
    ax.axhline(0, color='gray', linestyle='--', linewidth=0.5)
    ax.axvline(0, color='gray', linestyle='--', linewidth=0.5)

    # Axis labels (1-based factor numbers) and title
    ax.set_xlabel(f'Factor {choose[0]+1}')
    ax.set_ylabel(f'Factor {choose[1]+1}')
    ax.set_title(title)

    plt.grid(True, linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()
# Color palette
paleta_colores = ['#FFFF00', '#00FFFF', '#FF00FF', '#00FF00', '#FF0000', '#0892D0']

# Factor pairs to plot (0-based indices), each with the suffix used in its title
combinaciones_factores = [
    ((0, 1), "Factor 1 y Factor 2"),
    ((0, 2), "Factor 1 y Factor 3"),
    ((0, 3), "Factor 1 y Factor 4"),
    ((1, 2), "Factor 2 y Factor 3"),
    ((1, 3), "Factor 2 y Factor 4"),
    ((2, 3), "Factor 3 y Factor 4"),
]
#######################################################################
# Draw every biplot, each in its own color.
# The palette is indexed by the pair's position in the list: an index built as
# i + j - 1 would give pairs (0, 3) and (1, 2) the same color.
for color_idx, ((i, j), title_suffix) in enumerate(combinaciones_factores):
    biplot(fa_sin_rotacion, wine_scale, choose=(i, j),
           title=f"Biplot: {title_suffix} (Sin rotación)",
           point_color=paleta_colores[color_idx % len(paleta_colores)],
           vector_color='black')
Rotación de los factores
Rotación ortogonal (varimax)
# Build the model: 4 factors, principal method, varimax rotation
fa_varimax = FactorAnalyzer(n_factors=4, method='principal', rotation="varimax")

# Fit it to the data
fa_varimax.fit(wine_scale)
FactorAnalyzer(method='principal', n_factors=4, rotation='varimax',
rotation_kwargs={})
# Gráficos de cargas y scores
# Draw every biplot, each in its own color.
# The palette is indexed by the pair's position in the list: an index built as
# i + j - 1 would give pairs (0, 3) and (1, 2) the same color.
for color_idx, ((i, j), title_suffix) in enumerate(combinaciones_factores):
    biplot(fa_varimax, wine_scale, choose=(i, j),
           title=f"Biplot: {title_suffix} (Rotación=Varimax)",
           point_color=paleta_colores[color_idx % len(paleta_colores)],
           vector_color='black')
Rotación oblicua (promax)
# Build the model: 4 factors, principal method, promax rotation
fa_promax = FactorAnalyzer(n_factors=4, method='principal', rotation="promax")

# Fit it to the data
fa_promax.fit(wine_scale)
FactorAnalyzer(method='principal', n_factors=4, rotation_kwargs={})
# Gráficos de cargas y scores
# Draw every biplot, each in its own color.
# The palette is indexed by the pair's position in the list: an index built as
# i + j - 1 would give pairs (0, 3) and (1, 2) the same color.
for color_idx, ((i, j), title_suffix) in enumerate(combinaciones_factores):
    biplot(fa_promax, wine_scale, choose=(i, j),
           title=f"Biplot: {title_suffix} (Rotación=Promax)",
           point_color=paleta_colores[color_idx % len(paleta_colores)],
           vector_color='black')
# Correlations between the promax factors, estimated from the factor scores

# Factor scores
fa_scores = fa_promax.transform(wine_scale)

# Convert to a DataFrame with columns Factor1, Factor2, ...
fa_scores_df = pd.DataFrame(fa_scores, columns=[f'Factor{i+1}' for i in range(fa_scores.shape[1])])

# Correlation matrix of the scores
corr_fa_promax = fa_scores_df.corr()
print(corr_fa_promax)
Factor1 Factor2 Factor3 Factor4
Factor1 1.000000 0.138909 -0.085543 0.418329
Factor2 0.138909 1.000000 -0.021976 0.134548
Factor3 -0.085543 -0.021976 1.000000 0.100826
Factor4 0.418329 0.134548 0.100826 1.000000
Gráficos de factores
### Unrotated factors
loadings = fa_sin_rotacion.loadings_
loadings_no_rota = pd.DataFrame(loadings,
                                columns=[f'Factor{i+1}' for i in range(loadings.shape[1])])

# Keep only loadings with absolute value >= 0.50; the others become NaN
loadings_altos = loadings_no_rota.where(np.abs(loadings_no_rota) >= 0.50)
# Label the rows with the variable names
loadings_altos.index = [f'{wine_scale.columns[i]}' for i in range(wine_scale.shape[1])]

# Plot
plt.clf()
plt.figure(figsize=(10, 6))
sns.heatmap(loadings_altos, annot=True, cmap='coolwarm', center=0)
plt.title("Cargas factoriales - Sin rotación")
plt.xlabel("Factores")
plt.ylabel("Variables")
plt.show()
### Factors with orthogonal (varimax) rotation
loadings = fa_varimax.loadings_
loadings_varimax = pd.DataFrame(loadings,
                                columns=[f'Factor{i+1}' for i in range(loadings.shape[1])])

# Keep only loadings with absolute value >= 0.50; the others become NaN
loadings_altos = loadings_varimax.where(np.abs(loadings_varimax) >= 0.50)
# Label the rows with the variable names
loadings_altos.index = [f'{wine_scale.columns[i]}' for i in range(wine_scale.shape[1])]

# Plot
plt.clf()
plt.figure(figsize=(10, 6))
sns.heatmap(loadings_altos, annot=True, cmap='coolwarm', center=0)
plt.title("Cargas factoriales - Rotación Varimax")
plt.xlabel("Factores")
plt.ylabel("Variables")
plt.show()
### Factors with oblique (promax) rotation
loadings = fa_promax.loadings_
loadings_promax = pd.DataFrame(loadings,
                               columns=[f'Factor{i+1}' for i in range(loadings.shape[1])])

# Keep only loadings with absolute value >= 0.50; the others become NaN
loadings_altos = loadings_promax.where(np.abs(loadings_promax) >= 0.50)
# Label the rows with the variable names
loadings_altos.index = [f'{wine_scale.columns[i]}' for i in range(wine_scale.shape[1])]

# Plot
plt.clf()
plt.figure(figsize=(10, 6))
sns.heatmap(loadings_altos, annot=True, cmap='coolwarm', center=0)
# This figure shows the promax loadings, so the title must say Promax
plt.title("Cargas factoriales - Rotación Promax")
plt.xlabel("Factores")
plt.ylabel("Variables")
plt.show()
### Correlations between factors (oblique rotation)
plt.clf()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_fa_promax, annot=True, cmap="coolwarm", center=0, linewidths=0.5)
plt.title("Mapa de calor correlaciones de los factores - Rotación oblicua promax")
plt.show()