library(readxl)
library(collapsibleTree)
# Read the "data" sheet of the caract_cacao workbook into `df`.
df <- read_excel(
  path = "D:/Documentos/010 metodos multivariados/clases/Clase_19 (22-07-2021)/caract_cacao.xlsx",
  sheet = "data"
)
# Show the first rows as a quick sanity check.
head(df)
## # A tibble: 6 x 6
## c_s c_m rug cb fa fm
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 3 4 3 4
## 2 1 1 2 2 2 2
## 3 1 1 2 1 3 1
## 4 1 1 2 3 2 3
## 5 1 1 2 3 2 3
## 6 1 1 1 3 3 3
# Factor-coded copy of `df`: each of the six descriptor columns is converted
# with as.factor(); the original `df` is left untouched.
df2 <- df
df2[c("c_s", "c_m", "rug", "cb", "fa", "fm")] <-
  lapply(df[c("c_s", "c_m", "rug", "cb", "fa", "fm")], as.factor)
Árbol de categorías (combinaciones de variables)
# Tree of the category combinations in `df`, nested in the order listed.
collapsibleTree(
  df,
  hierarchy = c("c_s", "c_m", "rug", "cb", "fa", "fm")
)
Los diagramas de sectores (o de torta) no son buenos para mostrar un atributo: solo permiten buscar máximos y mínimos, en su mayoría son univariados y no deben tener más de 12 categorías.
# Pie charts of the frequency tables of `fm` and `c_s`.
pie(table(df[["fm"]]))
pie(table(df[["c_s"]]))
# One label per row combining its c_m and fm values, e.g. "c_m 1 fm 4".
cm_fm <- paste("c_m", df[["c_m"]], "fm", df[["fm"]])
cm_fm
## [1] "c_m 1 fm 4" "c_m 1 fm 2" "c_m 1 fm 1" "c_m 1 fm 3" "c_m 1 fm 3"
## [6] "c_m 1 fm 3" "c_m 0 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2"
## [11] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [16] "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [21] "c_m 1 fm 3" "c_m 1 fm 1" "c_m 1 fm 4" "c_m 1 fm 1" "c_m 1 fm 1"
## [26] "c_m 1 fm 3" "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3"
## [31] "c_m 0 fm 3" "c_m 1 fm 2" "c_m 1 fm 2" "c_m 1 fm 1" "c_m 1 fm 3"
## [36] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 4" "c_m 1 fm 3"
## [41] "c_m 1 fm 4" "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2"
## [46] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 3" "c_m 1 fm 3"
## [51] "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 0 fm 3"
## [56] "c_m 1 fm 3" "c_m 0 fm 1" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3"
## [61] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [66] "c_m 1 fm 3" "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2"
## [71] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [76] "c_m 1 fm 3" "c_m 1 fm 4" "c_m 1 fm 3" "c_m 1 fm 1" "c_m 1 fm 3"
## [81] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 4" "c_m 1 fm 3"
## [86] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [91] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 2"
## [96] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 2" "c_m 1 fm 3"
## [101] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [106] "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 2"
## [111] "c_m 0 fm 3" "c_m 1 fm 1" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3"
## [116] "c_m 0 fm 3" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 4"
# Pie chart of how often each c_m / fm combination label occurs.
pie(x = table(cm_fm))
Este mismo proceso de hallar la interacción entre variables se puede hacer, pero demanda tiempo; por lo tanto, se hace un análisis de correspondencias.
El análisis de interacción solo ofrece máximos y mínimos.
La distribución bivariada no se obtiene de mezclar dos univariadas.
La normalidad univariada no garantiza la normalidad bivariada.
ANÁLISIS DE CORRESPONDENCIA MÚLTIPLE
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
# The data frame columns must be factors so that the functions do not treat the
# integer codes as continuous variables; hence the factor-coded `df2` is used.
res.mca <- MCA(df2)
# Print the summary of the objects available in the MCA result.
print(res.mca)
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 120 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "intermediate results"
## 12 "$call$marge.col" "weights of columns"
## 13 "$call$marge.li" "weights of rows"
¿Cuántas dimensiones puedo extraer? Gráfico de sedimentación
# Scree plot for `res.mca` with bar labels; y axis limited to 0-45.
fviz_screeplot(
  res.mca,
  ylim = c(0, 45),
  addlabels = TRUE
)
Tres dimensiones explican aproximadamente el 55 %.
Se pueden detectar atípicos cualitativos.
Existe un único valor con fa = 6, por lo que puede ser un atípico.
# Drop the observation(s) with fa == 6 using a logical mask.
# Negative indexing with which() is unsafe here: when no row matches,
# -which(...) is an empty index and the subset returns zero rows instead of
# the full data. %in% never yields NA, so rows with a missing `fa` are kept,
# exactly as the which()-based version kept them.
df3 <- df[!(df$fa %in% 6), ]
# Factor-coded copy of `df3`: each of the six descriptor columns is converted
# with as.factor().
df3a <- df3
df3a[c("c_s", "c_m", "rug", "cb", "fa", "fm")] <-
  lapply(df3[c("c_s", "c_m", "rug", "cb", "fa", "fm")], as.factor)
# Tree of the category combinations in `df3a`, nested in the order listed.
collapsibleTree(
  df3a,
  hierarchy = c("c_s", "c_m", "rug", "cb", "fa", "fm")
)
# One label per row of `df3a` combining its c_m and fm levels.
cm_fm3 <- paste("c_m", df3a[["c_m"]], "fm", df3a[["fm"]])
cm_fm3
## [1] "c_m 1 fm 4" "c_m 1 fm 2" "c_m 1 fm 1" "c_m 1 fm 3" "c_m 1 fm 3"
## [6] "c_m 1 fm 3" "c_m 0 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2"
## [11] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [16] "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [21] "c_m 1 fm 3" "c_m 1 fm 1" "c_m 1 fm 4" "c_m 1 fm 1" "c_m 1 fm 1"
## [26] "c_m 1 fm 3" "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3"
## [31] "c_m 0 fm 3" "c_m 1 fm 2" "c_m 1 fm 2" "c_m 1 fm 1" "c_m 1 fm 3"
## [36] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 4" "c_m 1 fm 3"
## [41] "c_m 1 fm 4" "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2"
## [46] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 3" "c_m 1 fm 3"
## [51] "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 0 fm 3"
## [56] "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [61] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [66] "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 3"
## [71] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [76] "c_m 1 fm 4" "c_m 1 fm 3" "c_m 1 fm 1" "c_m 1 fm 3" "c_m 1 fm 3"
## [81] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 4" "c_m 1 fm 3" "c_m 1 fm 3"
## [86] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [91] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 2" "c_m 1 fm 2" "c_m 1 fm 3"
## [96] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 2" "c_m 1 fm 3" "c_m 1 fm 2"
## [101] "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 3"
## [106] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 2" "c_m 0 fm 3"
## [111] "c_m 1 fm 1" "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 0 fm 3"
## [116] "c_m 1 fm 2" "c_m 1 fm 3" "c_m 1 fm 3" "c_m 1 fm 4"
# Pie chart of how often each c_m / fm combination label occurs in `cm_fm3`.
pie(x = table(cm_fm3))
library(factoextra)
library(FactoMineR)
# The data frame columns must be factors so that the functions do not treat the
# integer codes as continuous variables; hence the factor-coded `df3a` is used.
res.mca3 <- MCA(df3a)
En este gráfico se observa que las categorías cb4 y fm4 son posibles atípicos, pero en la tabla de datos corresponden a siete muestras, por lo que no se pueden considerar como algo fuera de lo normal; es una agrupación que coincide en más de un atributo.
# Scree plot for `res.mca3` with bar labels; y axis limited to 0-45.
fviz_screeplot(
  res.mca3,
  ylim = c(0, 45),
  addlabels = TRUE
)
El proceso continúa descubriendo datos con criterio técnico para obtener el número de dimensiones y las agrupaciones que generen las categorías con homogeneidad en las características.
# Keep only the rows of `df` that meet one of four conditions:
# fa == 6, fm == 4 with cb == 4, fm == 1 with cb == 1, or c_s == 0 with c_m == 0.
# NOTE(review): the four index vectors are concatenated, so a row that meets
# more than one condition appears once per condition in `df4` -- confirm that
# this repetition is intended.
df4 <- df[
  c(
    which(df$fa == 6),
    which(df$fm == 4 & df$cb == 4),
    which(df$fm == 1 & df$cb == 1),
    which(df$c_s == 0 & df$c_m == 0)
  ),
]
# Filtering step for the analysis: the selected rows are factor-coded below.
df4a <- df4
df4a[c("c_s", "c_m", "rug", "cb", "fa", "fm")] <-
  lapply(df4[c("c_s", "c_m", "rug", "cb", "fa", "fm")], as.factor)
# MCA on the selected rows only.
res.mca4 <- MCA(df4a)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Print the summary of the objects available in the MCA result.
print(res.mca4)
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 22 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "intermediate results"
## 12 "$call$marge.col" "weights of columns"
## 13 "$call$marge.li" "weights of rows"
# Scree plot for `res.mca4` with bar labels; y axis limited to 0-45.
fviz_screeplot(
  res.mca4,
  ylim = c(0, 45),
  addlabels = TRUE
)
Los datos centrales no son muy informativos; a veces las respuestas encontradas redefinen las categorías predichas.
Componentes principales para categóricos
(install.packages("Gifi"))
# Original data (the factor-coded `df2`).
# The data table has to be converted into a matrix before calling princals().
library(Gifi)
x <- as.matrix(df2)
pca_cat <- princals(x)
# Loadings and variance accounted for by each component.
summary(pca_cat)
##
## Loadings (cutoff = 0.1):
## Comp1 Comp2
## rug -0.665 -0.237
## cb -0.942 -0.212
## fm -0.943 -0.209
## c_s -0.259 0.860
## c_m -0.272 0.807
## fa 0.176 -0.608
##
## Importance (Variance Accounted For):
## Comp1 Comp2
## Eigenvalues 2.3912 1.9057
## VAF 39.8534 31.7618
## Cumulative VAF 39.8500 71.6200
Referencia: Modern Psychometrics with R, pág. 235, nonlinear PCA.
La mayor parte de la varianza se explica en el componente principal uno, con las variables rug, cb y fm.
# Scree plot of the princals solution.
plot(pca_cat, plot.type = "screeplot")
# Loadings plot of the same solution, with a custom title.
plot(
  pca_cat,
  plot.type = "loadplot",
  main = "Biplot variables categoricas"
)
Los resultados demuestran que las variables de rugosidad, construcción basal y forma de la mazorca son las que permiten la mejor caracterización de la mazorca de cacao. Esto difiere de los resultados obtenidos con Statgraphics, por lo que no es adecuado usar PCA convencional.
## Aplicación del análisis de correspondencia múltiple a las tres variables seleccionadas
# Read the "data" sheet of the reduced workbook (columns rug, cb, fm) into `df5`.
df5 <- read_excel(
  path = "D:/Documentos/010 metodos multivariados/clases/Clase_19 (22-07-2021)/caract_cacao rug, cb,fm.xlsx",
  sheet = "data"
)
# Show the first rows as a quick sanity check.
head(df5)
## # A tibble: 6 x 3
## rug cb fm
## <dbl> <dbl> <dbl>
## 1 3 4 4
## 2 2 2 2
## 3 2 1 1
## 4 2 3 3
## 5 2 3 3
## 6 1 3 3
# Convert the three columns of `df5` to factors in place.
df5[c("rug", "cb", "fm")] <- lapply(df5[c("rug", "cb", "fm")], as.factor)
# Print the factor-coded table.
df5
## # A tibble: 120 x 3
## rug cb fm
## <fct> <fct> <fct>
## 1 3 4 4
## 2 2 2 2
## 3 2 1 1
## 4 2 3 3
## 5 2 3 3
## 6 1 3 3
## 7 1 3 3
## 8 3 3 3
## 9 2 3 3
## 10 2 2 2
## # ... with 110 more rows
# MCA on the three selected variables.
mca_df5 <- MCA(df5)
# Eigenvalues with the percentage and cumulative percentage of variance.
mca_df5[["eig"]]
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 7.913802e-01 2.967676e+01 29.67676
## dim 2 7.019337e-01 2.632251e+01 55.99927
## dim 3 6.664377e-01 2.499141e+01 80.99068
## dim 4 2.896063e-01 1.086024e+01 91.85092
## dim 5 2.092347e-01 7.846300e+00 99.69722
## dim 6 8.074070e-03 3.027776e-01 100.00000
## dim 7 1.055368e-30 3.957632e-29 100.00000
## dim 8 2.903043e-31 1.088641e-29 100.00000
# Results for the categories: coord, contrib, cos2, v.test and eta2.
mca_df5[["var"]]
## $coord
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug_1 -0.3262694 -0.86993769 0.031071644 1.62852635 0.31475405
## rug_2 -0.2903039 0.30510615 -0.011246255 -0.56459418 0.28753137
## rug_3 1.6970054 -0.03164932 0.002602925 0.03152624 -1.66866607
## cb_1 0.5399782 0.69977295 3.630739204 0.12502332 0.14436765
## cb_2 -0.4270343 1.75326222 -0.556231388 0.39609459 -0.18373515
## cb_3 -0.2510313 -0.63148338 -0.109712615 -0.14043062 -0.04559795
## cb_4 3.7769336 0.15556164 -0.909017109 0.04741400 1.01232481
## fm_1 0.5399782 0.69977295 3.630739204 0.12502332 0.14436765
## fm_2 -0.4628582 1.80446247 -0.565525850 0.39854264 -0.06291304
## fm_3 -0.2425897 -0.61721252 -0.112471277 -0.13453220 -0.08310249
## fm_4 3.7769336 0.15556164 -0.909017109 0.04741400 1.01232481
##
## $contrib
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug_1 0.9714899 7.786656673 1.046260e-02 66.13819853 3.4196359
## rug_2 2.2481824 2.799734899 4.006521e-03 23.23677305 8.3415723
## rug_3 18.1949671 0.007135142 5.083161e-05 0.01715957 66.5388425
## cb_1 0.8187566 1.550264045 4.395599e+01 0.11993930 0.2213572
## cb_2 1.6002132 30.411311238 3.223955e+00 3.76207342 1.1204393
## cb_3 1.7695304 12.624530263 4.013665e-01 1.51322334 0.2208231
## cb_4 35.0500832 0.067035438 2.410905e+00 0.01509387 9.5236114
## fm_1 0.8187566 1.550264045 4.395599e+01 0.11993930 0.2213572
## fm_2 1.8047598 30.924902494 3.199294e+00 3.65637106 0.1261120
## fm_3 1.6731775 12.211130324 4.270770e-01 1.40613469 0.7426377
## fm_4 35.0500832 0.067035438 2.410905e+00 0.01509387 9.5236114
##
## $cos2
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug_1 0.02944409 0.209325330 2.670385e-04 0.7335590420 0.0274023712
## rug_2 0.14556821 0.160791411 2.184624e-04 0.5505968300 0.1428010433
## rug_3 0.50820482 0.000176767 1.195627e-06 0.0001753948 0.4913729058
## cb_1 0.02082689 0.034977298 9.415905e-01 0.0011164880 0.0014887157
## cb_2 0.04798903 0.808928528 8.141930e-02 0.0412870857 0.0088838432
## cb_3 0.12603342 0.797542522 2.407372e-02 0.0394415166 0.0041583466
## cb_4 0.88368663 0.001499079 5.118748e-02 0.0001392621 0.0634832801
## fm_1 0.02082689 0.034977298 9.415905e-01 0.0011164880 0.0014887157
## fm_2 0.05355942 0.814021203 7.995487e-02 0.0397090591 0.0009895127
## fm_3 0.12222642 0.791206543 2.627264e-02 0.0375900482 0.0143432813
## fm_4 0.88368663 0.001499079 5.118748e-02 0.0001392621 0.0634832801
##
## $v.test
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug_1 -1.871856 -4.9909633 0.1782627 9.3431004 1.8057913
## rug_2 -4.162045 4.3742631 -0.1612359 -8.0945057 4.1222960
## rug_3 7.776656 -0.1450354 0.0119281 0.1444714 -7.6467886
## cb_1 1.574294 2.0401712 10.5853328 0.3645025 0.4209004
## cb_2 -2.389706 9.8113452 -3.1126993 2.2165656 -1.0281913
## cb_3 -3.872722 -9.7420511 -1.6925638 -2.1664580 -0.7034510
## cb_4 10.254692 0.4223629 -2.4680579 0.1287330 2.7485469
## fm_1 1.574294 2.0401712 10.5853328 0.3645025 0.4209004
## fm_2 -2.524593 9.8421808 -3.0845793 2.1737935 -0.3431501
## fm_3 -3.813783 -9.7032767 -1.7681753 -2.1149978 -1.3064649
## fm_4 10.254692 0.4223629 -2.4680579 0.1287330 2.7485469
##
## $eta2
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug 0.5084136 0.2230786 0.0002902994 0.77665584 0.49149254
## cb 0.9315791 0.9403064 0.9995008718 0.04700598 0.06958871
## fm 0.9341478 0.9424162 0.9995218550 0.04515721 0.06662273
# Eigenvalue table of the MCA (same table as printed right after fitting).
mca_df5[["eig"]]
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 7.913802e-01 2.967676e+01 29.67676
## dim 2 7.019337e-01 2.632251e+01 55.99927
## dim 3 6.664377e-01 2.499141e+01 80.99068
## dim 4 2.896063e-01 1.086024e+01 91.85092
## dim 5 2.092347e-01 7.846300e+00 99.69722
## dim 6 8.074070e-03 3.027776e-01 100.00000
## dim 7 1.055368e-30 3.957632e-29 100.00000
## dim 8 2.903043e-31 1.088641e-29 100.00000
# An MCA result has no top-level `coord` element, so `mca_df5$coord` is NULL.
# The category coordinates are stored under `$var$coord`.
mca_df5$var$coord
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug_1 -0.3262694 -0.86993769 0.031071644 1.62852635 0.31475405
## rug_2 -0.2903039 0.30510615 -0.011246255 -0.56459418 0.28753137
## rug_3 1.6970054 -0.03164932 0.002602925 0.03152624 -1.66866607
## cb_1 0.5399782 0.69977295 3.630739204 0.12502332 0.14436765
## cb_2 -0.4270343 1.75326222 -0.556231388 0.39609459 -0.18373515
## cb_3 -0.2510313 -0.63148338 -0.109712615 -0.14043062 -0.04559795
## cb_4 3.7769336 0.15556164 -0.909017109 0.04741400 1.01232481
## fm_1 0.5399782 0.69977295 3.630739204 0.12502332 0.14436765
## fm_2 -0.4628582 1.80446247 -0.565525850 0.39854264 -0.06291304
## fm_3 -0.2425897 -0.61721252 -0.112471277 -0.13453220 -0.08310249
## fm_4 3.7769336 0.15556164 -0.909017109 0.04741400 1.01232481
# Contribution of each category to each dimension.
mca_df5[["var"]][["contrib"]]
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## rug_1 0.9714899 7.786656673 1.046260e-02 66.13819853 3.4196359
## rug_2 2.2481824 2.799734899 4.006521e-03 23.23677305 8.3415723
## rug_3 18.1949671 0.007135142 5.083161e-05 0.01715957 66.5388425
## cb_1 0.8187566 1.550264045 4.395599e+01 0.11993930 0.2213572
## cb_2 1.6002132 30.411311238 3.223955e+00 3.76207342 1.1204393
## cb_3 1.7695304 12.624530263 4.013665e-01 1.51322334 0.2208231
## cb_4 35.0500832 0.067035438 2.410905e+00 0.01509387 9.5236114
## fm_1 0.8187566 1.550264045 4.395599e+01 0.11993930 0.2213572
## fm_2 1.8047598 30.924902494 3.199294e+00 3.65637106 0.1261120
## fm_3 1.6731775 12.211130324 4.270770e-01 1.40613469 0.7426377
## fm_4 35.0500832 0.067035438 2.410905e+00 0.01509387 9.5236114
El análisis de PCA para variables categóricas permite reducir el número de variables a tres, que explican casi el 81 % de la variabilidad en tres dimensiones, siendo las categorías fm4, cb4 y rug3 las más representativas, por lo que se recomienda reducir el número de categorías de las variables.