# <R 2.1> <자료 2.1>의 자료 중 X1, X2, X3, X4, X5의 상관계수행렬
econo1992 <- read.csv("../Data2022/econo1992.csv")
head(econo1992[, c(2:6)])
상관계수행렬
round(cor(econo1992[, c(2:6)]), 3)
x1 x2 x3 x4 x5
x1 1.000 0.988 0.993 0.984 0.996
x2 0.988 1.000 0.976 0.962 0.984
x3 0.993 0.976 1.000 0.997 0.997
x4 0.984 0.962 0.997 1.000 0.987
x5 0.996 0.984 0.997 0.987 1.000
summary(heptathlon)
hurdles highjump shot run200m longjump javelin run800m score
Min. :12.69 Min. :1.500 Min. :10.00 Min. :22.56 Min. :4.880 Min. :35.68 Min. :124.2 Min. :4566
1st Qu.:13.47 1st Qu.:1.770 1st Qu.:12.32 1st Qu.:23.92 1st Qu.:6.050 1st Qu.:39.06 1st Qu.:132.2 1st Qu.:5746
Median :13.75 Median :1.800 Median :12.88 Median :24.83 Median :6.250 Median :40.28 Median :134.7 Median :6137
Mean :13.84 Mean :1.782 Mean :13.12 Mean :24.65 Mean :6.152 Mean :41.48 Mean :136.1 Mean :6091
3rd Qu.:14.07 3rd Qu.:1.830 3rd Qu.:14.20 3rd Qu.:25.23 3rd Qu.:6.370 3rd Qu.:44.54 3rd Qu.:138.5 3rd Qu.:6351
Max. :16.42 Max. :1.860 Max. :16.23 Max. :26.61 Max. :7.270 Max. :47.50 Max. :163.4 Max. :7291
write.csv(heptathlon, file = "./heptathlon.csv")
변수들 중 hurdles, run200m, run800m은 작은 값일 수록 좋은 점수이기 때문에 자료를 변형시켜 준다. 즉, 높은 수가 좋은 점수가 되도록 최댓값에서 빼 줌으로써 자료를 역변환한다.
heptathlon$hurdles = max(heptathlon$hurdles) - heptathlon$hurdles
heptathlon$run200m = max(heptathlon$run200m) - heptathlon$run200m
heptathlon$run800m = max(heptathlon$run800m) - heptathlon$run800m
heptathlon
# 공분산행렬
round(cov(heptathlon[, -8]), 2)
hurdles highjump shot run200m longjump javelin run800m
hurdles 0.54 0.05 0.72 0.55 0.32 0.02 4.76
highjump 0.05 0.01 0.05 0.04 0.03 0.00 0.38
shot 0.72 0.05 2.23 0.99 0.53 1.42 5.19
run200m 0.55 0.04 0.99 0.94 0.38 1.14 4.96
longjump 0.32 0.03 0.53 0.38 0.22 0.11 2.75
javelin 0.02 0.00 1.42 1.14 0.11 12.57 -0.59
run800m 4.76 0.38 5.19 4.96 2.75 -0.59 68.74
# 상관계수행렬
round(cor(heptathlon), 2)
hurdles highjump shot run200m longjump javelin run800m score
hurdles 1.00 0.81 0.65 0.77 0.91 0.01 0.78 0.92
highjump 0.81 1.00 0.44 0.49 0.78 0.00 0.59 0.77
shot 0.65 0.44 1.00 0.68 0.74 0.27 0.42 0.80
run200m 0.77 0.49 0.68 1.00 0.82 0.33 0.62 0.86
longjump 0.91 0.78 0.74 0.82 1.00 0.07 0.70 0.95
javelin 0.01 0.00 0.27 0.33 0.07 1.00 -0.02 0.25
run800m 0.78 0.59 0.42 0.62 0.70 -0.02 1.00 0.77
score 0.92 0.77 0.80 0.86 0.95 0.25 0.77 1.00
names()는 princomp() 결과의 개체 이름을 보여준다. 이 개체들을 통해 주성분분석 결과를 알 수 있다. 주성분분석 결과에서 총 7개 주성분의 표준편차를 보여준다.
# <R 2.4> PCA
hep_data <- heptathlon[, -8]
hep_pca <- princomp(hep_data, cor=T, scores=T)
names(hep_pca)
[1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call"
hep_pca
Call:
princomp(x = hep_data, cor = T, scores = T)
Standard deviations:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
2.1119364 1.0928497 0.7218131 0.6761411 0.4952441 0.2701029 0.2213617
7 variables and 25 observations.
summary(hep_pca)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Standard deviation 2.1119364 1.0928497 0.72181309 0.67614113 0.49524412 0.27010291 0.221361710
Proportion of Variance 0.6371822 0.1706172 0.07443059 0.06530955 0.03503811 0.01042223 0.007000144
Cumulative Proportion 0.6371822 0.8077994 0.88222998 0.94753952 0.98257763 0.99299986 1.000000000
eig_val <- hep_pca$sdev^2
eig_val
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
4.46027516 1.19432056 0.52101413 0.45716683 0.24526674 0.07295558 0.04900101
# scree plot
screeplot(hep_pca, type = "lines", pch=19, main = "Scree Plot")
# 누적분산 그림
hep_var <- hep_pca$sdev^2
hep_var
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
4.46027516 1.19432056 0.52101413 0.45716683 0.24526674 0.07295558 0.04900101
hep_var_ratio <- hep_var / sum(hep_var)
round(hep_var_ratio, 3)
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
0.637 0.171 0.074 0.065 0.035 0.010 0.007
plot(cumsum(hep_var_ratio), type = 'b', pch = 19, xlab = 'Component', ylab = 'Cumulative Proportion')
title('Variance Explained')
# 주성분계수
hep_pca$loadings[, c(1:2)]
Comp.1 Comp.2
hurdles 0.4528710 0.15792058
highjump 0.3771992 0.24807386
shot 0.3630725 -0.28940743
run200m 0.4078950 -0.26038545
longjump 0.4562318 0.05587394
javelin 0.0754090 -0.84169212
run800m 0.3749594 0.22448984
# <R 2.7> 주성분점수 및 행렬도
hep_pca$scores[, c(1:2)]
Comp.1 Comp.2
Joyner-Kersee (USA) 4.20643487 -1.26802363
John (GDR) 2.94161870 -0.53452561
Behmer (GDR) 2.70427114 -0.69275901
Sablovskaite (URS) 1.37105209 -0.70655862
Choubenkova (URS) 1.38704979 -1.78931718
Schulz (GDR) 1.06537236 0.08104469
Fleming (AUS) 1.12307639 0.33042906
Greiner (USA) 0.94221015 0.82345074
Lajbnerova (CZE) 0.54118484 -0.14933917
Bouraga (URS) 0.77548704 0.53686251
Wijnsma (HOL) 0.56773896 1.42507414
Dimitrova (BUL) 1.21091937 0.36106077
Scheider (SWI) -0.01578005 -0.82307249
Braun (FRG) -0.00385205 -0.72953750
Ruotsalainen (FIN) -0.09261899 -0.77877955
Yuping (CHN) 0.14005513 0.54831883
Hagger (GB) -0.17465745 1.77914066
Brown (USA) -0.52996001 -0.74195530
Mulliner (GB) -1.14869009 0.64788023
Hautenauve (BEL) -1.10808552 1.88531477
Kytola (FIN) -1.47689483 0.94353198
Geremias (BRA) -2.05556037 0.09495979
Hui-Ing (TAI) -2.93969248 0.67514662
Jeong-Mi (KOR) -3.03136461 0.97939889
Launa (PNG) -6.39931438 -2.89774561
biplot(hep_pca, cex = 0.7, col = c("Red", "Blue"))
title("Biplot")
summary(beer)
cost size alcohol reputat color aroma taste
Min. : 0.00 Min. : 0.00 Min. :10.00 Min. : 30.00 Min. :40.00 Min. :30.00 Min. : 50.00
1st Qu.: 10.00 1st Qu.:10.00 1st Qu.:15.00 1st Qu.: 30.00 1st Qu.:50.00 1st Qu.:40.00 1st Qu.: 65.00
Median : 15.00 Median :15.00 Median :20.00 Median : 40.00 Median :60.00 Median :60.00 Median : 85.00
Mean : 27.78 Mean :22.22 Mean :23.89 Mean : 55.56 Mean :63.89 Mean :56.11 Mean : 80.56
3rd Qu.: 25.00 3rd Qu.:30.00 3rd Qu.:30.00 3rd Qu.: 80.00 3rd Qu.:80.00 3rd Qu.:65.00 3rd Qu.: 95.00
Max. :100.00 Max. :70.00 Max. :50.00 Max. :100.00 Max. :95.00 Max. :90.00 Max. :100.00
주성분분석을 하기 전에 변수 간의 상관관계를 알아보자. cor() 함수를 이용하여 변수들 간의 상관관계를 살펴 본다.
round(cor(beer), 2)
cost size alcohol reputat color aroma taste
cost 1.00 0.88 0.88 -0.17 0.32 -0.03 0.05
size 0.88 1.00 0.82 -0.06 0.01 -0.29 -0.31
alcohol 0.88 0.82 1.00 -0.36 0.40 0.10 0.06
reputat -0.17 -0.06 -0.36 1.00 -0.52 -0.52 -0.63
color 0.32 0.01 0.40 -0.52 1.00 0.82 0.80
aroma -0.03 -0.29 0.10 -0.52 0.82 1.00 0.87
taste 0.05 -0.31 0.06 -0.63 0.80 0.87 1.00
맥주의 가격(cost), 크기(size), 알코올 함량(alcohol)의 상관계수가 높아 서로 양의 상관관계가 있을 것으로 보인다. 또한 색(color), 향기(aroma), 맛(taste)이 서로 상관계수가 높게 나타났다. 반면 평판(reputat)은 다른 변수들과 상관계수가 비교적 높지 않으며, 다른 변수들과 모두 음의 상관관계를 나타냈다. 다음은 그걸 보여주는 산점도 행렬이다.
plot(beer, pch = 1)
# <R 2.10>
beer_pca <- princomp(beer, cor = F, scores = T) # scores = T : 각 케이스의 주성분점수를 포함하라는 의미
beer_pca
Call:
princomp(x = beer, cor = F, scores = T)
Standard deviations:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
39.117489 34.783073 17.974805 8.456192 6.479111 4.381410 2.579257
7 variables and 99 observations.
summary(beer_pca)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Standard deviation 39.1174891 34.7830731 17.9748054 8.45619163 6.47911123 4.38141009 2.579257268
Proportion of Variance 0.4778119 0.3777904 0.1008889 0.02232876 0.01310829 0.00599436 0.002077325
Cumulative Proportion 0.4778119 0.8556024 0.9564913 0.97882003 0.99192831 0.99792268 1.000000000
screeplot(beer_pca, type = "lines", pch = 19)
beer_pca$loadings[, c(1:3)]
Comp.1 Comp.2 Comp.3
cost 0.7344197 0.31868378 0.1902272
size 0.3942553 0.34017182 -0.1274959
alcohol 0.2831873 0.06888179 -0.0370581
reputat -0.3356901 0.49057105 0.7861697
color 0.2657171 -0.34379889 0.3862210
aroma 0.1398211 -0.48510439 0.3693624
taste 0.1488354 -0.42871339 0.2062207
# install.packages("ade4")
library(ade4)
data(deug)
?deug
names(deug)
[1] "tab" "result" "cent"
deug_tab <- deug$tab
head(deug_tab)
summary(deug_tab)
Algebra Analysis Proba Informatic Economy Option1 Option2 English Sport
Min. : 9.00 Min. :16.00 Min. : 2.00 Min. :10.50 Min. :34.50 Min. : 8.00 Min. : 5.00 Min. : 8.50 Min. : 0.000
1st Qu.:39.00 1st Qu.:29.00 1st Qu.:22.00 1st Qu.:21.00 1st Qu.:64.50 1st Qu.:19.75 1st Qu.:20.75 1st Qu.:18.80 1st Qu.: 8.500
Median :46.00 Median :33.00 Median :29.50 Median :25.75 Median :70.20 Median :23.00 Median :23.50 Median :21.20 Median :11.500
Mean :45.57 Mean :33.99 Mean :31.24 Mean :26.99 Mean :69.55 Mean :22.75 Mean :22.67 Mean :21.13 Mean : 9.231
3rd Qu.:52.00 3rd Qu.:40.00 3rd Qu.:41.25 3rd Qu.:30.75 3rd Qu.:76.50 3rd Qu.:26.00 3rd Qu.:26.25 3rd Qu.:23.85 3rd Qu.:12.000
Max. :72.00 Max. :58.00 Max. :65.00 Max. :54.00 Max. :90.90 Max. :34.00 Max. :30.00 Max. :31.00 Max. :15.000
round(cor(deug_tab), 3)
Algebra Analysis Proba Informatic Economy Option1 Option2 English Sport
Algebra 1.000 0.445 0.504 0.389 0.366 0.537 0.196 0.114 0.235
Analysis 0.445 1.000 0.516 0.320 0.207 0.404 0.062 -0.120 0.158
Proba 0.504 0.516 1.000 0.373 0.167 0.444 0.112 0.187 0.269
Informatic 0.389 0.320 0.373 1.000 0.076 0.250 0.086 0.131 0.062
Economy 0.366 0.207 0.167 0.076 1.000 0.371 0.339 0.406 0.178
Option1 0.537 0.404 0.444 0.250 0.371 1.000 0.204 0.094 0.259
Option2 0.196 0.062 0.112 0.086 0.339 0.204 1.000 0.021 0.077
English 0.114 -0.120 0.187 0.131 0.406 0.094 0.021 1.000 0.137
Sport 0.235 0.158 0.269 0.062 0.178 0.259 0.077 0.137 1.000
deug_tab_pca <- princomp(deug_tab, cor=T, scores=T)
names(deug_tab_pca)
[1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call"
deug_tab_pca
Call:
princomp(x = deug_tab, cor = T, scores = T)
Standard deviations:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
1.7610672 1.1674688 1.0160349 0.9664643 0.8600889 0.7580694 0.7297544 0.6614677 0.5336435
9 variables and 104 observations.
summary(deug_tab_pca)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
Standard deviation 1.7610672 1.1674688 1.0160349 0.9664643 0.86008891 0.75806943 0.72975436 0.6614677 0.53364350
Proportion of Variance 0.3445953 0.1514426 0.1147030 0.1037837 0.08219477 0.06385214 0.05917127 0.0486155 0.03164171
Cumulative Proportion 0.3445953 0.4960379 0.6107409 0.7145246 0.79671938 0.86057152 0.91974279 0.9683583 1.00000000
eigen_value <- deug_tab_pca$sdev^2
eigen_value
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
3.1013578 1.3629834 1.0323269 0.9340533 0.7397529 0.5746693 0.5325414 0.4375395 0.2847754
1보다 큰 고윳값의 수와 그 고윳값들이 확보하는 누정정보의 양을 구하시오.
Scree plot을 그리고 해석하시오.
screeplot(deug_tab_pca, type = "lines", pch=20, main = "Scree Plot")
위의 결과를 이용하여 주성분을 구하시오.
biplot을 그려 보고 주성분의 특징을 정리하시오.
biplot(deug_tab_pca, cex = 0.7, col = c("Red", "Blue"))
title("Biplot")
write.csv(deug_tab, "./deug_tab.csv")
summary(crime)
state Murder Assault UrbanPop Rape
Length:50 Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
Class :character 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
Mode :character Median : 7.250 Median :159.0 Median :66.00 Median :20.10
Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
round(cor(crime[, -1]), 2)
Murder Assault UrbanPop Rape
Murder 1.00 0.80 0.07 0.56
Assault 0.80 1.00 0.26 0.67
UrbanPop 0.07 0.26 1.00 0.41
Rape 0.56 0.67 0.41 1.00
전체적으로 변수들 간에 양의 상관관계를 보인다. (in my thought)
crime_pca
Call:
princomp(x = crime[, -1], cor = T, scores = T)
Standard deviations:
Comp.1 Comp.2 Comp.3 Comp.4
1.5748783 0.9948694 0.5971291 0.4164494
4 variables and 50 observations.
summary(crime_pca)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4
Standard deviation 1.5748783 0.9948694 0.5971291 0.41644938
Proportion of Variance 0.6200604 0.2474413 0.0891408 0.04335752
Cumulative Proportion 0.6200604 0.8675017 0.9566425 1.00000000
crime_var
Comp.1 Comp.2 Comp.3 Comp.4
2.4802416 0.9897652 0.3565632 0.1734301
round(crime_var_ratio, 3)
Comp.1 Comp.2 Comp.3 Comp.4
0.620 0.247 0.089 0.043
plot(cumsum(crime_var_ratio), type = 'b', pch = 19, xlab = 'Component', ylab = 'Cumulative Proportion')
title('Variance Explained')
# 주성분계수
crime_pca$loadings[, c(1:2)]
Comp.1 Comp.2
Murder 0.5358995 0.4181809
Assault 0.5831836 0.1879856
UrbanPop 0.2781909 -0.8728062
Rape 0.5434321 -0.1673186
# <R 2.7> 주성분점수 및 행렬도
crime_pca$scores[, c(1:2)]
Comp.1 Comp.2
[1,] 0.98556588 1.13339238
[2,] 1.95013775 1.07321326
[3,] 1.76316354 -0.74595678
[4,] -0.14142029 1.11979678
[5,] 2.52398013 -1.54293399
[6,] 1.51456286 -0.98755509
[7,] -1.35864746 -1.08892789
[8,] 0.04770931 -0.32535892
[9,] 3.01304227 0.03922851
[10,] 1.63928304 1.27894240
[11,] -0.91265715 -1.57046001
[12,] -1.63979985 0.21097292
[13,] 1.37891072 -0.68184119
[14,] -0.50546136 -0.15156254
[15,] -2.25364607 -0.10405407
[16,] -0.79688112 -0.27016470
[17,] -0.75085907 0.95844029
[18,] 1.56481798 0.87105466
[19,] -2.39682949 0.37639158
[20,] 1.76336939 0.42765519
[21,] -0.48616629 -1.47449650
[22,] 2.10844115 -0.15539682
[23,] -1.69268181 -0.63226125
[24,] 0.99649446 2.39379599
[25,] 0.69678733 -0.26335479
[26,] -1.18545191 0.53687437
[27,] -1.26563654 -0.19395373
[28,] 2.87439454 -0.77560020
[29,] -2.38391541 -0.01808229
[30,] 0.18156611 -1.44950571
[31,] 1.98002375 0.14284878
[32,] 1.68257738 -0.82318414
[33,] 1.12337861 2.22800338
[34,] -2.99222562 0.59911882
[35,] -0.22596542 -0.74223824
[36,] -0.31178286 -0.28785421
[37,] 0.05912208 -0.54141145
[38,] -0.88841582 -0.57110035
[39,] -0.86377206 -1.49197842
[40,] 1.32072380 1.93340466
[41,] -1.98777484 0.82334324
[42,] 0.99974168 0.86025130
[43,] 1.35513821 -0.41248082
[44,] -0.55056526 -1.47150461
[45,] -2.80141174 1.40228806
[46,] -0.09633491 0.19973529
[47,] -0.21690338 -0.97012418
[48,] -2.10858541 1.42484670
[49,] -2.07971417 -0.61126862
[50,] -0.62942666 0.32101297
biplot(crime_pca, cex = 0.7, col = c("Red", "Blue"))
title("Biplot")