CASO PRÁCTICO: Textura de alimentos.
Descripción. Medidas de textura de un alimento tipo pastelería
Fuente de datos:
Objetivo: Realizar el análisis correspondiente y evaluar si es conveniente el análisis factorial
# elimina objetos en su espacio de trabajo
rm(list = ls())
# elimina la notación cientifica
options(scipen=999)
# cargamos la data set
library(readr)
data = read_csv("food-texture.csv")
head(data)
## # A tibble: 6 × 6
## ...1 Oil Density Crispy Fracture Hardness
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 B110 16.5 2955 10 23 97
## 2 B136 17.7 2660 14 9 139
## 3 B171 16.2 2870 12 17 143
## 4 B192 16.7 2920 10 31 95
## 5 B225 16.3 2975 11 26 143
## 6 B237 19.1 2790 13 16 189
# No considerar la primera columna Id
data$...1 = NULL
Verificamos que la data no contenga valores missing
# verificamos valores missing
library(visdat)
vis_miss(data ,sort_miss = TRUE)
# Evaluando valores outliers mediante distancia Mahalanobis (D2)
library(psych)
outlier(data, plot=T, bad=5, na.rm=T)
## 1 2 3 4 5 6 7
## 2.4297219 8.6501824 2.0579031 5.8035487 2.8518485 7.5797345 1.4794609
## 8 9 10 11 12 13 14
## 5.2534420 1.6406785 0.9213588 2.9021603 3.2941249 5.5856819 8.0601591
## 15 16 17 18 19 20 21
## 5.7751095 10.1395884 7.0000590 1.2811307 9.0562884 4.6911646 8.7955549
## 22 23 24 25 26 27 28
## 5.6641208 1.9717431 24.3698192 4.4734014 3.7006304 1.5221735 1.3355215
## 29 30 31 32 33 34 35
## 2.2127160 6.4011472 6.6897464 5.4058251 8.2283364 3.6987644 5.6492164
## 36 37 38 39 40 41 42
## 7.1105302 6.6481897 5.8988650 1.0506025 2.1311589 2.5637181 0.8551690
## 43 44 45 46 47 48 49
## 0.3795618 9.1031456 3.1512040 1.1715058 3.2413791 4.1430455 6.9684749
## 50
## 4.0113873
H0: Hay Normalidad multivariante
H1: No hay normalidad multivariante
Regla de decisión: p-valor < alfa ===> Se rechaza Ho
library (MVN)
mvn(data = data, mvnTest = "mardia") #Test de Mardia
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 67.8598697068406 0.000718316314647963 NO
## 2 Mardia Kurtosis 2.20388414586882 0.0275324928017431 NO
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Oil 0.6207 0.1004 YES
## 2 Anderson-Darling Density 0.3303 0.5066 YES
## 3 Anderson-Darling Crispy 1.0865 0.0068 NO
## 4 Anderson-Darling Fracture 0.2447 0.7491 YES
## 5 Anderson-Darling Hardness 0.2707 0.6609 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## Oil 50 17.202 1.592007 16.9 13.7 21.2 16.30 18.10
## Density 50 2857.600 124.499980 2867.5 2570.0 3125.0 2772.50 2945.00
## Crispy 50 11.520 1.775571 12.0 7.0 15.0 10.00 13.00
## Fracture 50 20.860 5.466073 21.0 9.0 33.0 17.00 25.00
## Hardness 50 128.180 31.127578 126.0 63.0 192.0 107.25 143.75
## Skew Kurtosis
## Oil 0.414609911 0.2968479
## Density -0.179909843 -0.4617514
## Crispy -0.278751503 -0.4806965
## Fracture -0.107763984 -0.6118981
## Hardness 0.006626536 -0.4208868
mvn(data = data, mvnTest = "energy") #Energy test
## $multivariateNormality
## Test Statistic p value MVN
## 1 E-statistic 1.402616 0.006 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Oil 0.6207 0.1004 YES
## 2 Anderson-Darling Density 0.3303 0.5066 YES
## 3 Anderson-Darling Crispy 1.0865 0.0068 NO
## 4 Anderson-Darling Fracture 0.2447 0.7491 YES
## 5 Anderson-Darling Hardness 0.2707 0.6609 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## Oil 50 17.202 1.592007 16.9 13.7 21.2 16.30 18.10
## Density 50 2857.600 124.499980 2867.5 2570.0 3125.0 2772.50 2945.00
## Crispy 50 11.520 1.775571 12.0 7.0 15.0 10.00 13.00
## Fracture 50 20.860 5.466073 21.0 9.0 33.0 17.00 25.00
## Hardness 50 128.180 31.127578 126.0 63.0 192.0 107.25 143.75
## Skew Kurtosis
## Oil 0.414609911 0.2968479
## Density -0.179909843 -0.4617514
## Crispy -0.278751503 -0.4806965
## Fracture -0.107763984 -0.6118981
## Hardness 0.006626536 -0.4208868
mvn(data = data, mvnTest = "royston") #Test de Test Royston
## $multivariateNormality
## Test H p value MVN
## 1 Royston 4.052572 0.239368 YES
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Oil 0.6207 0.1004 YES
## 2 Anderson-Darling Density 0.3303 0.5066 YES
## 3 Anderson-Darling Crispy 1.0865 0.0068 NO
## 4 Anderson-Darling Fracture 0.2447 0.7491 YES
## 5 Anderson-Darling Hardness 0.2707 0.6609 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## Oil 50 17.202 1.592007 16.9 13.7 21.2 16.30 18.10
## Density 50 2857.600 124.499980 2867.5 2570.0 3125.0 2772.50 2945.00
## Crispy 50 11.520 1.775571 12.0 7.0 15.0 10.00 13.00
## Fracture 50 20.860 5.466073 21.0 9.0 33.0 17.00 25.00
## Hardness 50 128.180 31.127578 126.0 63.0 192.0 107.25 143.75
## Skew Kurtosis
## Oil 0.414609911 0.2968479
## Density -0.179909843 -0.4617514
## Crispy -0.278751503 -0.4806965
## Fracture -0.107763984 -0.6118981
## Hardness 0.006626536 -0.4208868
mvn(data = data, mvnTest = "hz") #Test de Henze-Zirkler
## $multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 0.9749839 0.03723508 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling Oil 0.6207 0.1004 YES
## 2 Anderson-Darling Density 0.3303 0.5066 YES
## 3 Anderson-Darling Crispy 1.0865 0.0068 NO
## 4 Anderson-Darling Fracture 0.2447 0.7491 YES
## 5 Anderson-Darling Hardness 0.2707 0.6609 YES
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th
## Oil 50 17.202 1.592007 16.9 13.7 21.2 16.30 18.10
## Density 50 2857.600 124.499980 2867.5 2570.0 3125.0 2772.50 2945.00
## Crispy 50 11.520 1.775571 12.0 7.0 15.0 10.00 13.00
## Fracture 50 20.860 5.466073 21.0 9.0 33.0 17.00 25.00
## Hardness 50 128.180 31.127578 126.0 63.0 192.0 107.25 143.75
## Skew Kurtosis
## Oil 0.414609911 0.2968479
## Density -0.179909843 -0.4617514
## Crispy -0.278751503 -0.4806965
## Fracture -0.107763984 -0.6118981
## Hardness 0.006626536 -0.4208868
H0:Las varianzas de todos los grupos son iguales.
H1:Al menos una de las varianzas de los grupos es diferente.
library(stats)
bartlett.test(data)
##
## Bartlett test of homogeneity of variances
##
## data: data
## Bartlett's K-squared = 899.63, df = 4, p-value < 0.00000000000000022
como el p-value = 0.00000000000000022 < 0.05 tenemos evidencia suficiente para rechazar la hipótesis nula
La KMO mide la adecuación de los datos para el análisis factorial, indicando si la correlación entre las variables es lo suficientemente alta como para proceder con la extracción de componentes principales o factores
library(psych)
KMO(data)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data)
## Overall MSA = 0.71
## MSA for each item =
## Oil Density Crispy Fracture Hardness
## 0.82 0.71 0.67 0.79 0.43
el Overall MSA = 0.71 indica que se considera aceptable para realizar un análisis factorial
Busca maximizar la probabilidad de observar los datos bajo el modelo de análisis factorial. Es útil cuando se asume una distribución específica para los datos (por ejemplo, distribución normal multivariante).
# modelo máxima verosimilitud
modelo1<-fa(data, nfactors = 3,rotate = "none", fm="mle")
modelo1
## Factor Analysis using method = ml
## Call: fa(r = data, nfactors = 3, rotate = "none", fm = "mle")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## Oil 0.63 -0.52 -0.08 0.68 0.322 2.0
## Density -0.71 0.58 -0.02 0.84 0.158 1.9
## Crispy 0.99 0.05 0.03 0.98 0.019 1.0
## Fracture -0.86 -0.06 0.26 0.81 0.193 1.2
## Hardness 0.38 0.65 -0.04 0.57 0.430 1.6
##
## ML1 ML2 ML3
## SS loadings 2.76 1.04 0.08
## Proportion Var 0.55 0.21 0.02
## Cumulative Var 0.55 0.76 0.78
## Proportion Explained 0.71 0.27 0.02
## Cumulative Proportion 0.71 0.98 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 10 with the objective function = 3.33 with Chi Square = 154.99
## df of the model are -2 and the objective function was 0
##
## The root mean square of the residuals (RMSR) is 0
## The df corrected root mean square of the residuals is NA
##
## The harmonic n.obs is 50 with the empirical chi square 0 with prob < NA
## The total n.obs was 50 with Likelihood Chi Square = 0 with prob < NA
##
## Tucker Lewis Index of factoring reliability = 1.072
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of (regression) scores with factors 0.99 0.90 0.54
## Multiple R square of scores with factors 0.98 0.81 0.30
## Minimum correlation of possible factor scores 0.97 0.61 -0.41
Aunque no es estrictamente un método de análisis factorial, PCA es ampliamente utilizado para extraer factores. Los componentes principales son combinaciones lineales de las variables originales que explican la mayor varianza posible en los datos. Sin embargo, a diferencia del análisis factorial, PCA no asume una estructura latente específica en los datos
# modelo minimo residuo
modelo2<-fa(data, nfactors = 3,rotate = "none",fm="minres")
modelo2
## Factor Analysis using method = minres
## Call: fa(r = data, nfactors = 3, rotate = "none", fm = "minres")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 MR2 MR3 h2 u2 com
## Oil 0.74 -0.38 -0.09 0.69 0.308 1.5
## Density -0.82 0.42 -0.08 0.85 0.154 1.5
## Crispy 0.95 0.26 0.09 0.97 0.032 1.2
## Fracture -0.84 -0.24 0.10 0.77 0.235 1.2
## Hardness 0.24 0.72 0.00 0.58 0.420 1.2
##
## MR1 MR2 MR3
## SS loadings 2.86 0.96 0.03
## Proportion Var 0.57 0.19 0.01
## Cumulative Var 0.57 0.76 0.77
## Proportion Explained 0.74 0.25 0.01
## Cumulative Proportion 0.74 0.99 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 10 with the objective function = 3.33 with Chi Square = 154.99
## df of the model are -2 and the objective function was 0
##
## The root mean square of the residuals (RMSR) is 0
## The df corrected root mean square of the residuals is NA
##
## The harmonic n.obs is 50 with the empirical chi square 0 with prob < NA
## The total n.obs was 50 with Likelihood Chi Square = 0 with prob < NA
##
## Tucker Lewis Index of factoring reliability = 1.072
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## MR1 MR2 MR3
## Correlation of (regression) scores with factors 0.98 0.90 0.40
## Multiple R square of scores with factors 0.97 0.81 0.16
## Minimum correlation of possible factor scores 0.93 0.62 -0.69
# comparando las comunalidades
sort(modelo1$communality,decreasing = T)->c1
sort(modelo2$communality,decreasing = T)->c2
head(cbind(c1,c2))
## c1 c2
## Crispy 0.9807396 0.9684370
## Density 0.8422707 0.8455529
## Fracture 0.8073597 0.7651522
## Oil 0.6776512 0.6922706
## Hardness 0.5700114 0.5798243
# comparacion de las unicidades
sort(modelo1$uniquenesses,decreasing = T)->u1
sort(modelo2$uniquenesses,decreasing = T)->u2
head(cbind(u1,u2))
## u1 u2
## Hardness 0.42998859 0.42017568
## Oil 0.32234878 0.30772938
## Fracture 0.19264032 0.23484777
## Density 0.15772927 0.15444711
## Crispy 0.01926036 0.03156303
Los dos métodos más comunes para extraer los factores son:
El más común porque es fácil comparar entre varias combinaciones de ítems con el mismo método de estimación. Es sensible a violaciones de normalidad pero se corrige cuando especificamos la correlación correcta (ej. polychoric).
library(psych)
scree(data)
Preferible con escalas continuas no-normales. Pero menos comparable entre modelos.
library(psych)
fa.parallel(data,n.obs=200,fa="fa",fm="minres")
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
rotar la matriz en el análisis factorial es una técnica esencial para obtener una estructura más interpretable y significativa de los factores extraídos. La elección de qué tipo de rotación utilizar depende de la naturaleza de los datos y de las suposiciones teóricas subyacentes. La rotación puede ayudar a simplificar y aclarar la interpretación de los resultados del análisis factorial y hacer que los factores extraídos sean más coherentes con las teorías o conceptos subyacentes
En la rotación ortogonal, los factores se mantienen no correlacionados entre sí después de la rotación. Los métodos de rotación ortogonal más comunes son la rotación Varimax y la rotación Equamax, entre otros. Estos métodos buscan minimizar la complejidad de los factores, lo que puede facilitar la interpretación
library(psych)
# Realizar análisis factorial con rotación Varimax
factor_analisis <- principal(data, nfactors = 2, rotate = "varimax")
# Ver los resultados del análisis
print(factor_analisis)
## Principal Components Analysis
## Call: principal(r = data, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 h2 u2 com
## Oil -0.90 -0.08 0.81 0.19 1.0
## Density 0.93 0.05 0.86 0.14 1.0
## Crispy -0.77 0.57 0.91 0.09 1.8
## Fracture 0.71 -0.57 0.83 0.17 1.9
## Hardness 0.11 0.95 0.91 0.09 1.0
##
## RC1 RC2
## SS loadings 2.77 1.56
## Proportion Var 0.55 0.31
## Cumulative Var 0.55 0.87
## Proportion Explained 0.64 0.36
## Cumulative Proportion 0.64 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 2 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.06
## with the empirical chi square 3.55 with prob < 0.06
##
## Fit based upon off diagonal values = 0.99
En la rotación oblicua, se permite que los factores extraídos estén correlacionados entre sí. Esto puede reflejar una estructura más realista de las relaciones entre variables en el mundo real. La rotación oblicua es útil cuando se espera que los factores estén correlacionados en función del conocimiento previo o teorías existentes.
# Realizar análisis factorial con rotación Promin (oblicua)
factor_analisis_oblicuo <- fa(data, nfactors = 2, rotate = "promax")
# Ver los resultados del análisis
print(factor_analisis_oblicuo)
## Factor Analysis using method = minres
## Call: fa(r = data, nfactors = 2, rotate = "promax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 MR2 h2 u2 com
## Oil -0.87 -0.19 0.68 0.323 1.1
## Density 0.97 0.21 0.83 0.169 1.1
## Crispy -0.66 0.51 0.95 0.048 1.9
## Fracture 0.57 -0.47 0.75 0.252 1.9
## Hardness 0.26 0.83 0.60 0.399 1.2
##
## MR1 MR2
## SS loadings 2.54 1.27
## Proportion Var 0.51 0.25
## Cumulative Var 0.51 0.76
## Proportion Explained 0.67 0.33
## Cumulative Proportion 0.67 1.00
##
## With factor correlations of
## MR1 MR2
## MR1 1.00 -0.37
## MR2 -0.37 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 10 with the objective function = 3.33 with Chi Square = 154.99
## df of the model are 1 and the objective function was 0.01
##
## The root mean square of the residuals (RMSR) is 0
## The df corrected root mean square of the residuals is 0.02
##
## The harmonic n.obs is 50 with the empirical chi square 0.02 with prob < 0.87
## The total n.obs was 50 with Likelihood Chi Square = 0.32 with prob < 0.57
##
## Tucker Lewis Index of factoring reliability = 1.048
## RMSEA index = 0 and the 90 % confidence intervals are 0 0.312
## BIC = -3.59
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## MR1 MR2
## Correlation of (regression) scores with factors 0.97 0.94
## Multiple R square of scores with factors 0.94 0.88
## Minimum correlation of possible factor scores 0.88 0.76
Método de rotación ortogonal que minimiza el número de variables que tienen saturaciones altas en cada factor. Simplifica la interpretación de los factores.
El método necesita un valor delta que servirá para ajustar los ejes en función de las saturaciones buscan una mejor aproximación, pero considerando que la varianza se distribuirá entre todos los factores.
Método de rotación que minimiza el número de factores necesarios para explicar cada variable.
Método de rotación que es combinación del método varimax, que simplifica los factores, y el método quartimax, que simplifica las variables. Se minimiza tanto el número de variables que saturan alto en un factor como el número de factores necesarios para explicar una variable.
Rotación oblicua que permite que los factores estén correlacionados. Esta rotación se puede calcular más rápidamente que una rotación oblimin directa, por lo que es útil para conjuntos de datos grandes.
library(GPArotation)
rot= c("none", "varimax", "quartimax","Promax")
bi_mod = function(tipo){
biplot.psych(fa(data,nfactors = 2,fm="minres",rotate = tipo),main = paste("Biplot con rotación ",tipo),col=c(2,3,4),pch = c(21,18),group = bfi[,"gender"])
}
sapply(rot,bi_mod)
## $none
## NULL
##
## $varimax
## NULL
##
## $quartimax
## NULL
##
## $Promax
## NULL
library(psych)
# Realizar análisis factorial
factor_analisis <- fa(data, nfactors = 2, rotate = "varimax")
# Obtener las puntuaciones factoriales
puntuaciones <- factor.scores(data, factor_analisis$loadings)
# Ver las puntuaciones factoriales
print(puntuaciones)
## $scores
## MR1 MR2
## [1,] 0.592800522 -0.63736997
## [2,] -1.360269685 0.72970701
## [3,] 0.188487196 0.69735431
## [4,] 0.482081164 -1.04668809
## [5,] 0.864292318 0.41047603
## [6,] -0.494653562 0.98322701
## [7,] -1.010066630 0.03920159
## [8,] -0.523896015 -2.03777664
## [9,] 0.721295367 0.26068124
## [10,] 0.633838352 0.19487550
## [11,] -0.425028743 0.07762536
## [12,] 0.041859393 0.67819921
## [13,] -0.421848778 1.79450928
## [14,] 1.137092893 1.66996846
## [15,] 0.592475537 -1.57951690
## [16,] -0.508816345 2.19464034
## [17,] -1.640810769 -0.04706609
## [18,] 0.089646545 0.61921438
## [19,] -1.026613800 0.10826370
## [20,] -1.171062291 0.93102306
## [21,] 1.771145632 0.74331826
## [22,] 1.181298085 0.20429503
## [23,] -0.640910181 0.69318308
## [24,] -0.098635815 -2.73020795
## [25,] -0.388422408 0.82853314
## [26,] -0.071064001 0.58433272
## [27,] 0.573835762 -0.71276991
## [28,] 0.533478183 -0.63241610
## [29,] -0.338653588 0.70681356
## [30,] -1.724940495 -1.59205745
## [31,] 1.267892865 0.08876734
## [32,] 0.216568676 -1.03169008
## [33,] 2.020544374 -1.55998011
## [34,] -0.553114189 0.44052057
## [35,] 0.709316944 1.31592910
## [36,] -2.470208340 -0.63413053
## [37,] -1.698413224 -0.88795803
## [38,] -1.429219292 0.50637314
## [39,] -0.017411344 -0.54294312
## [40,] 0.724609523 -0.56656513
## [41,] 0.954024558 -0.06888391
## [42,] -0.002967509 0.31116650
## [43,] 0.550385094 0.19735394
## [44,] 1.882120299 -0.57683451
## [45,] -0.052752540 0.24333145
## [46,] 0.202138967 -0.34019848
## [47,] 0.847037696 1.38012731
## [48,] 0.883434855 -0.49151049
## [49,] -1.649005979 -0.50572861
## [50,] 0.057084717 -1.41071953
##
## $weights
## MR1 MR2
## Oil -0.25497935 -0.2221754
## Density 0.52046507 0.5391284
## Crispy -0.28885480 0.8776854
## Fracture 0.05477739 -0.1416914
## Hardness 0.19520322 0.3496994
##
## $r.scores
## MR1 MR2
## MR1 1.0000000000000000000000 0.0000000000000001179612
## MR2 0.0000000000000001110223 1.0000000000000000000000
##
## $missing
## [1] FALSE
##
## $R2
## [1] 0.9543456 0.9194812
modelo_varimax = fa(data,nfactors = 2,rotate = "varimax",
fa="minres")
fa.diagram(modelo_varimax)