1) Mô tả dữ liệu:

Nguồn: Kassambara. Practical to Principal Component analysis. Kindle version, 2023.

Dữ liệu mô tả hoạt động thể thao 2 sự kiện Desctar và OlympicG:

Tải thư viện:
Trong bài sử dụng thư viện FactoMineR , và \(\color{blue}{\text{factoextra}}\) để phân tích và hiển thị dữ liệu.

library(FactoMineR)
library(factoextra)

Đọc dữ liệu:

data(decanthlon2)

Xem 6 hàng đầu

head(decathlon2)
##           X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE    11.04      7.58    14.83      2.07 49.81        14.69  43.75
## CLAY      10.76      7.40    14.26      1.86 49.37        14.05  50.72
## BERNARD   11.02      7.23    14.25      1.92 48.93        14.99  40.87
## YURKOV    11.34      7.09    15.19      2.10 50.42        15.31  46.26
## ZSIVOCZKY 11.13      7.30    13.48      2.01 48.62        14.17  45.67
## McMULLEN  10.83      7.31    13.76      2.13 49.91        14.38  44.41
##           Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE          5.02    63.19  291.7    1   8217    Decastar
## CLAY            4.92    60.15  301.5    2   8122    Decastar
## BERNARD         5.32    62.77  280.1    4   8067    Decastar
## YURKOV          4.72    63.44  276.4    5   8036    Decastar
## ZSIVOCZKY       4.42    55.37  268.0    7   8004    Decastar
## McMULLEN        4.42    56.37  285.1    8   7995    Decastar

Xem cấu trúc

str(decathlon2)
## 'data.frame':    27 obs. of  13 variables:
##  $ X100m       : num  11 10.8 11 11.3 11.1 ...
##  $ Long.jump   : num  7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
##  $ Shot.put    : num  14.8 14.3 14.2 15.2 13.5 ...
##  $ High.jump   : num  2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
##  $ X400m       : num  49.8 49.4 48.9 50.4 48.6 ...
##  $ X110m.hurdle: num  14.7 14.1 15 15.3 14.2 ...
##  $ Discus      : num  43.8 50.7 40.9 46.3 45.7 ...
##  $ Pole.vault  : num  5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
##  $ Javeline    : num  63.2 60.1 62.8 63.4 55.4 ...
##  $ X1500m      : num  292 302 280 276 268 ...
##  $ Rank        : int  1 2 4 5 7 8 9 10 11 12 ...
##  $ Points      : int  8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
##  $ Competition : Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...

Tạo bản sao:

decathlon2.active <- decathlon2[1:23, 1:10]

Xem 3 hàng đầu:

head(decathlon2.active[, 1:5],3) # head of 3 rows of 5 columns.
##         X100m Long.jump Shot.put High.jump X400m
## SEBRLE  11.04      7.58    14.83      2.07 49.81
## CLAY    10.76      7.40    14.26      1.86 49.37
## BERNARD 11.02      7.23    14.25      1.92 48.93

Xem cấu trúc:

str(decathlon2.active)
## 'data.frame':    23 obs. of  10 variables:
##  $ X100m       : num  11 10.8 11 11.3 11.1 ...
##  $ Long.jump   : num  7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
##  $ Shot.put    : num  14.8 14.3 14.2 15.2 13.5 ...
##  $ High.jump   : num  2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
##  $ X400m       : num  49.8 49.4 48.9 50.4 48.6 ...
##  $ X110m.hurdle: num  14.7 14.1 15 15.3 14.2 ...
##  $ Discus      : num  43.8 50.7 40.9 46.3 45.7 ...
##  $ Pole.vault  : num  5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
##  $ Javeline    : num  63.2 60.1 62.8 63.4 55.4 ...
##  $ X1500m      : num  292 302 280 276 268 ...

2) Phân tích dữ liệu:

Chuẩn hóa dữ liệu:

res.pca <- PCA(decathlon2.active, graph=FALSE) #Standadized data.
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 23 individuals, described by 10 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

4.4.1 eigenvales/variances:

eig.val <- get_eigenvalue(res.pca)
eig.val
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   4.1242133        41.242133                    41.24213
## Dim.2   1.8385309        18.385309                    59.62744
## Dim.3   1.2391403        12.391403                    72.01885
## Dim.4   0.8194402         8.194402                    80.21325
## Dim.5   0.7015528         7.015528                    87.22878
## Dim.6   0.4228828         4.228828                    91.45760
## Dim.7   0.3025817         3.025817                    94.48342
## Dim.8   0.2744700         2.744700                    97.22812
## Dim.9   0.1552169         1.552169                    98.78029
## Dim.10  0.1219710         1.219710                   100.00000

3) Hiển thị dữ liệu:

scree plot:

fviz_eig(res.pca, addlabels = TRUE, elim=c(0,50))

** 4.4.2 graph of variables:**

var <- get_pca_var(res.pca)
print(var)
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

coordinates: Xem 6 hàng coord

head(var$coord)
##                   Dim.1       Dim.2      Dim.3       Dim.4      Dim.5
## X100m        -0.8506257 -0.17939806  0.3015564  0.03357320 -0.1944440
## Long.jump     0.7941806  0.28085695 -0.1905465 -0.11538956  0.2331567
## Shot.put      0.7339127  0.08540412  0.5175978  0.12846837 -0.2488129
## High.jump     0.6100840 -0.46521415  0.3300852  0.14455012  0.4027002
## X400m        -0.7016034  0.29017826  0.2835329  0.43082552  0.1039085
## X110m.hurdle -0.7641252 -0.02474081  0.4488873 -0.01689589  0.2242200

Xem 6 hàng cos2

head(var$cos2)
##                  Dim.1        Dim.2      Dim.3        Dim.4      Dim.5
## X100m        0.7235641 0.0321836641 0.09093628 0.0011271597 0.03780845
## Long.jump    0.6307229 0.0788806285 0.03630798 0.0133147506 0.05436203
## Shot.put     0.5386279 0.0072938636 0.26790749 0.0165041211 0.06190783
## High.jump    0.3722025 0.2164242070 0.10895622 0.0208947375 0.16216747
## X400m        0.4922473 0.0842034209 0.08039091 0.1856106269 0.01079698
## X110m.hurdle 0.5838873 0.0006121077 0.20149984 0.0002854712 0.05027463

Xem 6 hàng contrib

head(var$contrib)
##                  Dim.1      Dim.2     Dim.3       Dim.4     Dim.5
## X100m        17.544293  1.7505098  7.338659  0.13755240  5.389252
## Long.jump    15.293168  4.2904162  2.930094  1.62485936  7.748815
## Shot.put     13.060137  0.3967224 21.620432  2.01407269  8.824401
## High.jump     9.024811 11.7715838  8.792888  2.54987951 23.115504
## X400m        11.935544  4.5799296  6.487636 22.65090599  1.539012
## X110m.hurdle 14.157544  0.0332933 16.261261  0.03483735  7.166193

Xem 4 hàngcoordinates of varibles

head(var$coord, 4)
##                Dim.1       Dim.2      Dim.3      Dim.4      Dim.5
## X100m     -0.8506257 -0.17939806  0.3015564  0.0335732 -0.1944440
## Long.jump  0.7941806  0.28085695 -0.1905465 -0.1153896  0.2331567
## Shot.put   0.7339127  0.08540412  0.5175978  0.1284684 -0.2488129
## High.jump  0.6100840 -0.46521415  0.3300852  0.1445501  0.4027002

plot variables:

fviz_pca_var(res.pca, col.var = "black")

# 4.4.2.3 quality of representation

head(var$cos2, 4)
##               Dim.1       Dim.2      Dim.3      Dim.4      Dim.5
## X100m     0.7235641 0.032183664 0.09093628 0.00112716 0.03780845
## Long.jump 0.6307229 0.078880629 0.03630798 0.01331475 0.05436203
## Shot.put  0.5386279 0.007293864 0.26790749 0.01650412 0.06190783
## High.jump 0.3722025 0.216424207 0.10895622 0.02089474 0.16216747

let visualize the cos2:

library(corrplot)
corrplot(var$cos2, is.corr = FALSE)

total cos2 of variables on Dim.1 and Dim.2:

fviz_cos2(res.pca, choice = "var", axes=1:2)

Color by cos2 values: quality on the factor map:

fviz_pca_var(res.pca, col.var = "cos2", alpha.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE # Avoid text overlapping
)

# 4.4.2.4 contribution of variables to PCs Xem 4 hàng đầu

head(var$contrib,4)
##               Dim.1      Dim.2     Dim.3     Dim.4     Dim.5
## X100m     17.544293  1.7505098  7.338659 0.1375524  5.389252
## Long.jump 15.293168  4.2904162  2.930094 1.6248594  7.748815
## Shot.put  13.060137  0.3967224 21.620432 2.0140727  8.824401
## High.jump  9.024811 11.7715838  8.792888 2.5498795 23.115504

Let’s plot

corrplot(var$contrib, is.corr = F)

Contributions of variables to PC1

fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

Contributions of variables to PC2

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

The total contribution to PC1 and PC2 is obtained with the following R code:

fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 10)

the most important plot #page 21

fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

```

LS0tDQp0aXRsZTogJ1BDQSBwaMOibiB0w61jaCB0aMOgbmggcGjhuqduIGNow61uaCBk4buvIGxp4buHdSBEZWNhdGhsb24nDQphdXRob3I6ICJoZW5yeSBkbyINCmZvbnQtZmFtaWx5OiAnYXJpYWwnDQpkYXRlOiAiMDEvMDcvMjAyMyINCm91dHB1dDoNCiAgaHRtbF9kb2N1bWVudDoNCiAgICBjb2RlX2Rvd25sb2FkOiB5ZXMNCiAgICBjb2RlX2ZvbGRpbmc6IGhpZGUNCiAgICBoaWdobGlnaHQ6IHB5Z21lbnRzDQogICAgdGhlbWU6IGZsYXRseQ0KICAgIHRvYzogeWVzDQogICAgdG9jX2Zsb2F0OiBubw0KICB3b3JkX2RvY3VtZW50Og0KICAgIHRvYzogeWVzDQotLS0NCg0KDQoNCmBgYHtyIHNldHVwLGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUsIHdhcm5pbmcgPSBGQUxTRSwgbWVzc2FnZSA9IEZBTFNFKQ0KYGBgDQoNCiMjIyA8Zm9udCBjb2xvcj0ncmVkJz4xKSBNw7QgdOG6oyBk4buvIGxp4buHdTogPC9mb250PiANCg0KDQoNCioqTmd14buTbjoqKiBLYXNzYW1iYXJhLiAqUHJhY3RpY2FsIHRvIFByaW5jaXBhbCBDb21wb25lbnQgYW5hbHlzaXMqLiBLaW5kbGUgdmVyc2lvbiwgMjAyMy4NCg0KROG7ryBsaeG7h3UgbcO0IHThuqMgaG/huqF0IMSR4buZbmcgdGjhu4MgdGhhbyAyIHPhu7Ega2nhu4duIERlc2N0YXIgdsOgIE9seW1waWNHOiANCg0KICArIDEzIGJp4bq/biAodmFyaWFibGVzKQ0KICArIDI3IHF1YW4gc8OhdCAob2JzZXJ2YXRpb25zKQ0KDQoNCioqVOG6o2kgdGjGsCB2aeG7h246KipcDQpUcm9uZyBiw6BpIHPhu60gZOG7pW5nIA0KdGjGsCB2aeG7h24gPGZvbnQgY29sb3I9J2JsdWUnPkZhY3RvTWluZVIgPC9mb250PiwgdsOgICRcY29sb3J7Ymx1ZX17XHRleHR7ZmFjdG9leHRyYX19JCDEkeG7gyBwaMOibiB0w61jaCB2w6AgaGnhu4NuIHRo4buLIGThu68gbGnhu4d1LlwNCmBgYHtyfQ0KbGlicmFyeShGYWN0b01pbmVSKQ0KbGlicmFyeShmYWN0b2V4dHJhKQ0KDQoNCmBgYA0KIA0KDQoqKsSQ4buNYyBk4buvIGxp4buHdToqKg0KDQpgYGB7cn0NCmRhdGEoZGVjYW50aGxvbjIpDQoNCg0KDQpgYGANCioqWGVtIDYgaMOgbmcgxJHhuqd1KioNCmBgYHtyfQ0KaGVhZChkZWNhdGhsb24yKQ0KYGBgDQoNCioqWGVtIGPhuqV1IHRyw7pjKioNCmBgYHtyfQ0Kc3RyKGRlY2F0aGxvbjIpDQpgYGANCg0KKipU4bqhbyBi4bqjbiBzYW86KioNCmBgYHtyfQ0KZGVjYXRobG9uMi5hY3RpdmUgPC0gZGVjYXRobG9uMlsxOjIzLCAxOjEwXQ0KDQpgYGANCg0KDQoqKlhlbSAzIGjDoG5nIMSR4bqndToqKg0KDQpgYGB7cn0NCmhlYWQoZGVjYXRobG9uMi5hY3RpdmVbLCAxOjVdLDMpICMgaGVhZCBvZiAzIHJvd3Mgb2YgNSBjb2x1bW5zLg0KDQpgYGANCioqWGVtIGPhuqV1IHRyw7pjOioqDQoNCmBgYHtyfQ0Kc3RyKGRlY2F0aGxvbjIuYWN0aXZlKQ0KDQpgYGANCiMjIyA8Zm9udCBjb2xvcj0ncmVkJz4yKSBQaMOibiB0w61jaCBk4buvIGxp4buHdTogPC9mb250PiANCg0KKipDaHXhuqluIGjDs2EgZOG7ryBsaeG7h3U6KioNCg0KYGBge3J9DQpyZXMucGNhIDwtIFBDQShkZWNhdGhsb24yLmFjdGl2ZSwgZ3JhcGg9RkFMU0UpICNTdGFuZGFkaXplZCBkYXRhLg0KcHJpbnQocmVzLnBjYSkNCmBgYA0KDQoqKjQuNC4xIGVpZ2VudmFsZXMvdmFyaWFuY2VzOioqDQpgYGB7cn0NCmVpZy52YWwgPC0gZ2V0X2VpZ2VudmFsdWUocmVzLnBjYSkNCmVpZy52YWwNCmBgYA0KDQoNCg0KDQojIyMgPGZvbnQgY29sb3I9J3JlZCc+MykgSGnhu4NuIHRo4buLIGThu68gbGnhu4d1OiA8L2ZvbnQ+IA0KDQoqKnNjcmVlIHBsb3Q6KioNCg0KYGBge3J9DQpmdml6X2VpZyhyZXMucGNhLCBhZGRsYWJlbHMgPSBUUlVFLCBlbGltPWMoMCw1MCkpDQpgYGANCg0KKiogNC40LjIgZ3JhcGggb2YgdmFyaWFibGVzOioqDQoNCmBgYHtyfQ0KDQp2YXIgPC0gZ2V0X3BjYV92YXIocmVzLnBjYSkNCnByaW50KHZhcikNCmBgYA0KDQoqKmNvb3JkaW5hdGVzOioqDQpYZW0gNiBow6BuZyBjb29yZA0KYGBge3J9DQoNCmhlYWQodmFyJGNvb3JkKQ0KDQpgYGANClhlbSA2IGjDoG5nIGNvczINCmBgYHtyfQ0KaGVhZCh2YXIkY29zMikNCg0KYGBgDQpYZW0gNiBow6BuZyBjb250cmliDQpgYGB7cn0NCg0KaGVhZCh2YXIkY29udHJpYikNCmBgYA0KDQpYZW0gNCBow6BuZ2Nvb3JkaW5hdGVzIG9mIHZhcmlibGVzDQpgYGB7cn0NCmhlYWQodmFyJGNvb3JkLCA0KQ0KYGBgDQoqKnBsb3QgdmFyaWFibGVzOioqDQoNCmBgYHtyfQ0KDQpmdml6X3BjYV92YXIocmVzLnBjYSwgY29sLnZhciA9ICJibGFjayIpDQpgYGANCg0KDQoNCioqIyA0LjQuMi4zIHF1YWxpdHkgb2YgcmVwcmVzZW50YXRpb24qKg0KDQpgYGB7cn0NCg0KaGVhZCh2YXIkY29zMiwgNCkNCmBgYA0KDQoqKmxldCB2aXN1YWxpemUgdGhlIGNvczI6KioNCg0KYGBge3J9DQoNCmxpYnJhcnkoY29ycnBsb3QpDQpjb3JycGxvdCh2YXIkY29zMiwgaXMuY29yciA9IEZBTFNFKQ0KYGBgDQoNCioqdG90YWwgY29zMiBvZiB2YXJpYWJsZXMgb24gRGltLjEgYW5kIERpbS4yOioqDQoNCmBgYHtyfQ0KDQpmdml6X2NvczIocmVzLnBjYSwgY2hvaWNlID0gInZhciIsIGF4ZXM9MToyKQ0KYGBgDQoqKkNvbG9yIGJ5IGNvczIgdmFsdWVzOiBxdWFsaXR5IG9uIHRoZSBmYWN0b3IgbWFwOioqDQoNCmBgYHtyfQ0KDQoNCmZ2aXpfcGNhX3ZhcihyZXMucGNhLCBjb2wudmFyID0gImNvczIiLCBhbHBoYS52YXIgPSAiY29zMiIsDQogICAgICAgICAgICAgZ3JhZGllbnQuY29scyA9IGMoIiMwMEFGQkIiLCAiI0U3QjgwMCIsICIjRkM0RTA3IiksIA0KICAgICAgICAgICAgIHJlcGVsID0gVFJVRSAjIEF2b2lkIHRleHQgb3ZlcmxhcHBpbmcNCikNCmBgYA0KDQoqKiMgNC40LjIuNCBjb250cmlidXRpb24gb2YgdmFyaWFibGVzIHRvIFBDcyoqDQpYZW0gNCBow6BuZyDEkeG6p3UNCmBgYHtyfQ0KDQpoZWFkKHZhciRjb250cmliLDQpDQoNCmBgYA0KTGV0J3MgcGxvdA0KYGBge3J9DQpjb3JycGxvdCh2YXIkY29udHJpYiwgaXMuY29yciA9IEYpDQpgYGANCg0KDQoNCioqQ29udHJpYnV0aW9ucyBvZiB2YXJpYWJsZXMgdG8gUEMxKioNCg0KYGBge3J9DQoNCg0KZnZpel9jb250cmliKHJlcy5wY2EsIGNob2ljZSA9ICJ2YXIiLCBheGVzID0gMSwgdG9wID0gMTApDQoNCmBgYA0KDQoqKkNvbnRyaWJ1dGlvbnMgb2YgdmFyaWFibGVzIHRvIFBDMioqDQoNCmBgYHtyfQ0KIyBDb250cmlidXRpb25zIG9mIHZhcmlhYmxlcyB0byBQQzINCmZ2aXpfY29udHJpYihyZXMucGNhLCBjaG9pY2UgPSAidmFyIiwgYXhlcyA9IDIsIHRvcCA9IDEwKQ0KYGBgDQoNCioqVGhlIHRvdGFsIGNvbnRyaWJ1dGlvbiB0byBQQzEgYW5kIFBDMiBpcyBvYnRhaW5lZCB3aXRoIHRoZSBmb2xsb3dpbmcgUiBjb2RlOiAqKg0KDQpgYGB7cn0NCg0KZnZpel9jb250cmliKHJlcy5wY2EsIGNob2ljZSA9ICJ2YXIiLCBheGVzID0gMToyLCB0b3AgPSAxMCkNCg0KYGBgDQoNCioqdGhlIG1vc3QgaW1wb3J0YW50IHBsb3QgI3BhZ2UgMjEqKg0KDQpgYGB7cn0NCg0KZnZpel9wY2FfdmFyKHJlcy5wY2EsIGNvbC52YXIgPSAiY29udHJpYiIsDQogICAgICAgICAgICAgZ3JhZGllbnQuY29scyA9IGMoIiMwMEFGQkIiLCAiI0U3QjgwMCIsICIjRkM0RTA3IikNCikNCmBgYA0KDQpgYGANCg0K