Librerias útiles

#library("devtools")
#install_github("kassambara/factoextra")
library("factoextra");library(FactoMineR)

preparando los datos

data("decathlon2")
head(decathlon2)
##           X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE    11.04      7.58    14.83      2.07 49.81        14.69  43.75
## CLAY      10.76      7.40    14.26      1.86 49.37        14.05  50.72
## BERNARD   11.02      7.23    14.25      1.92 48.93        14.99  40.87
## YURKOV    11.34      7.09    15.19      2.10 50.42        15.31  46.26
## ZSIVOCZKY 11.13      7.30    13.48      2.01 48.62        14.17  45.67
## McMULLEN  10.83      7.31    13.76      2.13 49.91        14.38  44.41
##           Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE          5.02    63.19  291.7    1   8217    Decastar
## CLAY            4.92    60.15  301.5    2   8122    Decastar
## BERNARD         5.32    62.77  280.1    4   8067    Decastar
## YURKOV          4.72    63.44  276.4    5   8036    Decastar
## ZSIVOCZKY       4.42    55.37  268.0    7   8004    Decastar
## McMULLEN        4.42    56.37  285.1    8   7995    Decastar

Extraer sólo individuos activos y variables para el análisis de componentes principales:

decathlon2.active <- decathlon2[1:23, 1:10]
head(decathlon2.active[, 1:6])
##           X100m Long.jump Shot.put High.jump X400m X110m.hurdle
## SEBRLE    11.04      7.58    14.83      2.07 49.81        14.69
## CLAY      10.76      7.40    14.26      1.86 49.37        14.05
## BERNARD   11.02      7.23    14.25      1.92 48.93        14.99
## YURKOV    11.34      7.09    15.19      2.10 50.42        15.31
## ZSIVOCZKY 11.13      7.30    13.48      2.01 48.62        14.17
## McMULLEN  10.83      7.31    13.76      2.13 49.91        14.38

matriz de correlación

cor.mat <- round(cor(decathlon2.active),2)
head(cor.mat[, 1:6])
##              X100m Long.jump Shot.put High.jump X400m X110m.hurdle
## X100m         1.00     -0.76    -0.45     -0.40  0.59         0.73
## Long.jump    -0.76      1.00     0.44      0.34 -0.51        -0.59
## Shot.put     -0.45      0.44     1.00      0.53 -0.31        -0.38
## High.jump    -0.40      0.34     0.53      1.00 -0.37        -0.25
## X400m         0.59     -0.51    -0.31     -0.37  1.00         0.58
## X110m.hurdle  0.73     -0.59    -0.38     -0.25  0.58         1.00
colMeans(decathlon2.active)
##        X100m    Long.jump     Shot.put    High.jump        X400m 
##    10.999565     7.349565    14.620000     2.007391    49.433043 
## X110m.hurdle       Discus   Pole.vault     Javeline       X1500m 
##    14.533913    45.160435     4.796522    59.114783   277.877826

Visualización de la matriz de correlación

library(corrplot)
corrplot(cor.mat, type="upper", order="hclust", 
         tl.col="black", tl.srt=45)

library("PerformanceAnalytics")
chart.Correlation(decathlon2.active[, 1:6], histogram=FALSE, pch=19)

Analisis de componentes principales (método 1)

deca_pca <- princomp(covmat=cor.mat)
summary(deca_pca,loadings = TRUE)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4    Comp.5
## Standard deviation     2.0294239 1.3568859 1.1130945 0.90629539 0.8357948
## Proportion of Variance 0.4118562 0.1841139 0.1238979 0.08213713 0.0698553
## Cumulative Proportion  0.4118562 0.5959701 0.7198680 0.80200516 0.8718605
##                            Comp.6    Comp.7     Comp.8    Comp.9
## Standard deviation     0.64774627 0.5535350 0.52454606 0.4038205
## Proportion of Variance 0.04195752 0.0306401 0.02751486 0.0163071
## Cumulative Proportion  0.91381799 0.9444581 0.97197295 0.9882800
##                           Comp.10
## Standard deviation     0.34234416
## Proportion of Variance 0.01171995
## Cumulative Proportion  1.00000000
## 
## Loadings:
##              Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## X100m         0.419 -0.130 -0.272         0.231         0.186 -0.162
## Long.jump    -0.392  0.206  0.168 -0.131 -0.277         0.316 -0.742
## Shot.put     -0.362        -0.465  0.142  0.299  0.365              
## High.jump    -0.301 -0.345 -0.294  0.155 -0.476  0.450         0.155
## X400m         0.344  0.215 -0.255  0.477 -0.126        -0.500 -0.477
## X110m.hurdle  0.376        -0.403        -0.269         0.667       
## Discus       -0.366        -0.156  0.444  0.480 -0.315  0.257       
## Pole.vault    0.106  0.596        -0.372  0.272  0.503              
## Javeline     -0.210  0.287 -0.544 -0.364 -0.241 -0.545 -0.258  0.114
## X1500m               0.576  0.204  0.491 -0.322         0.187  0.389
##              Comp.9 Comp.10
## X100m         0.746  0.240 
## Long.jump     0.137  0.107 
## Shot.put     -0.212  0.599 
## High.jump     0.335 -0.335 
## X400m        -0.167 -0.123 
## X110m.hurdle -0.409 -0.107 
## Discus              -0.493 
## Pole.vault    0.121 -0.379 
## Javeline      0.102        
## X1500m        0.214  0.215

Los valores propios corresponden a la magnitud de la variación explicada por cada componente principal (pc). Los valores propios son grandes para la primera PC y pequeños para las siguientes PC. ahora haremos el gráfico del codo para ver los valores y vectores propios

#res.pca <- PCA(decathlon2.active, graph = FALSE)
#eigenvalues <- res.pca$eig
#barplot(eigenvalues[, 2], names.arg=1:nrow(eigenvalues), 
#       main = "Variances",
#       xlab = "Principal Components",
#       ylab = "Percentage of variances",
#       col ="steelblue")
# Add connected line segments to the plot
#lines(x = 1:nrow(eigenvalues), eigenvalues[, 2], 
#      type="b", pch=19, col = "red")
plot(deca_pca$sdev^2,xlab = 'número componente', ylab = 'componente varianza',
     type = 'l', main = 'diagrama de codo')

~60% de las informaciones (desviaciones) contenidas en los datos son retenidas por los dos primeros componentes principales.

plano factorial seria el siguiente

Analisis de componentes principales (método 2)

res.pca <- PCA(decathlon2.active, graph = FALSE)
eigenvalues <- res.pca$eig
head(eigenvalues)
##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1  4.1242133              41.242133                          41.24213
## comp 2  1.8385309              18.385309                          59.62744
## comp 3  1.2391403              12.391403                          72.01885
## comp 4  0.8194402               8.194402                          80.21325
## comp 5  0.7015528               7.015528                          87.22878
## comp 6  0.4228828               4.228828                          91.45760

Los valores propios corresponden a la magnitud de la variación explicada por cada componente principal (pc). Los valores propios son grandes para la primera PC y pequeños para las siguientes PC. ahora haremos el gráfico del codo para ver los valores y vectores propios

fviz_screeplot(res.pca,addlabels=TRUE, hjust = -0.3)

#require(plotly)
#ggplotly(viz, tooltip = c("colour","text", "shape","fill","long","lat"))
#t1<-fviz_pca_var(res.pca)
#ggplotly(t1)

~60% de las informaciones (desviaciones) contenidas en los datos son retenidas por los dos primeros componentes principales.

los coeficientes serían los siguientes

round(res.pca$var$coord, digits = 3)#Coordenadas de variables sobre los componentes principales
##               Dim.1  Dim.2  Dim.3  Dim.4  Dim.5
## X100m        -0.851 -0.179  0.302  0.034 -0.194
## Long.jump     0.794  0.281 -0.191 -0.115  0.233
## Shot.put      0.734  0.085  0.518  0.128 -0.249
## High.jump     0.610 -0.465  0.330  0.145  0.403
## X400m        -0.702  0.290  0.284  0.431  0.104
## X110m.hurdle -0.764 -0.025  0.449 -0.017  0.224
## Discus        0.743  0.050  0.177  0.395 -0.408
## Pole.vault   -0.217  0.807  0.094 -0.339 -0.222
## Javeline      0.428  0.386  0.604 -0.332  0.198
## X1500m        0.004  0.784 -0.219  0.448  0.263
round(res.pca$var$contrib, digits = 3)#Las contribuciones variables en la determinación de un componente principal dado son (en porcentaje)
##               Dim.1  Dim.2  Dim.3  Dim.4  Dim.5
## X100m        17.544  1.751  7.339  0.138  5.389
## Long.jump    15.293  4.290  2.930  1.625  7.749
## Shot.put     13.060  0.397 21.620  2.014  8.824
## High.jump     9.025 11.772  8.793  2.550 23.116
## X400m        11.936  4.580  6.488 22.651  1.539
## X110m.hurdle 14.158  0.033 16.261  0.035  7.166
## Discus       13.393  0.134  2.515 19.041 23.756
## Pole.vault    1.145 35.462  0.714 14.023  7.005
## Javeline      4.446  8.109 29.453 13.430  5.578
## X1500m        0.000 33.473  3.887 24.494  9.878

plano factorial

fviz_pca_ind(res.pca)

fviz_pca_var(res.pca)

fviz_pca_contrib(res.pca,choice = "var", axes = 1)
## Warning in fviz_pca_contrib(res.pca, choice = "var", axes = 1): The
## function fviz_pca_contrib() is deprecated. Please use the function
## fviz_contrib() which can handle outputs of PCA, CA and MCA functions.

fviz_pca_contrib(res.pca,choice = "var", axes = 2)
## Warning in fviz_pca_contrib(res.pca, choice = "var", axes = 2): The
## function fviz_pca_contrib() is deprecated. Please use the function
## fviz_contrib() which can handle outputs of PCA, CA and MCA functions.

fviz_pca_biplot(res.pca)