======================================================================
@ JungHwan Yun
@ Master Student in Data-Science
@ Seoul National University of Science & Technology(SeoulTech)
@ E-mail : junghwan.yun@seoultech.ac.kr
======================================================================
======================================================================
[Contents]
======================================================================
Ex. 3.3 Find the principal components of the following correlation matrix given by MacDonnell (1902) from measurements of seven physical characteristics in each of 3000 convicted criminals: How would you interpret the derived components?
cor_matrix <- matrix(c(1,0.402,0.396,0.301,0.305,0.339,0.34,
0.402,1,0.618,0.15,0.135,0.206,0.183,
0.396,0.618,1,0.321,0.289,0.363,0.345,
0.301,0.15,0.321,1,0.846,0.759,0.661,
0.305,0.135,0.289,0.846,1,0.797,0.8,
0.339,0.206,0.363,0.759,0.797,1,0.736,
0.34,0.183,0.345,0.661,0.8,0.736,1),nrow = 7,ncol = 7,byrow = T)
dimnames(cor_matrix) <- list(c("Head.Length","Head.Breadth","Face.Breadth","Left.finger.length"
,"Left.forearm.length","Left.foot.length","Height"),
c("Head.Length","Head.Breadth","Face.Breadth","Left.finger.length"
,"Left.forearm.length","Left.foot.length","Height"))
print(cor_matrix)
Head.Length Head.Breadth Face.Breadth Left.finger.length Left.forearm.length Left.foot.length Height
Head.Length 1.000 0.402 0.396 0.301 0.305 0.339 0.340
Head.Breadth 0.402 1.000 0.618 0.150 0.135 0.206 0.183
Face.Breadth 0.396 0.618 1.000 0.321 0.289 0.363 0.345
Left.finger.length 0.301 0.150 0.321 1.000 0.846 0.759 0.661
Left.forearm.length 0.305 0.135 0.289 0.846 1.000 0.797 0.800
Left.foot.length 0.339 0.206 0.363 0.759 0.797 1.000 0.736
Height 0.340 0.183 0.345 0.661 0.800 0.736 1.000
head_pca <- princomp(covmat = cor_matrix,cor = T)
print(summary(head_pca), loadings = T)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Standard deviation 1.9492241 1.2256950 0.80610632 0.6000474 0.58237656 0.48502898 0.33751644
Proportion of Variance 0.5427821 0.2146183 0.09282963 0.0514367 0.04845178 0.03360759 0.01627391
Cumulative Proportion 0.5427821 0.7574004 0.85023003 0.9016667 0.95011851 0.98372609 1.00000000
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Head.Length -0.276 -0.365 0.882
Head.Breadth -0.212 -0.639 -0.258 0.687
Face.Breadth -0.295 -0.512 -0.381 -0.699 -0.101
Left.finger.length -0.438 0.235 0.102 -0.619 0.318 0.503
Left.forearm.length -0.456 0.277 0.113 0.290 -0.785
Left.foot.length -0.450 0.178 -0.870
Height -0.436 0.180 0.770 0.233 0.353
8.12. Consider the air-pollution data listed in Table1.5. Your job is to summarize these data in fewer than p = 7 dimensions if possible. Conduct a principal compoenet analysis of the data using both the covariance matrix S and the correlation matrix R. What have you learnd? Does it make any difference which matrix is chosen for analysis? Can data be summarized in three or fewer dimensions? Can you interpret the principal components?
Table1.5 : 42 measurements on air-pollution variables recorded at 12:00 noon in the LA area on different days.
air_pollution <- read.delim("D:/Google Drive/1_MASTER_Class/3_2_Multivariate_Analysis/3_Assignment/Wichern_data/T1-5.DAT", header=F, sep="")
colnames(air_pollution) <- c("Wind","Solar_radiation","CO","NO","NO_2","O_3","HC")
head(air_pollution)
S <- cov(air_pollution)
print(S)
Wind Solar_radiation CO NO NO_2 O_3 HC
Wind 2.5000000 -2.7804878 -0.3780488 -0.4634146 -0.5853659 -2.2317073 0.1707317
Solar_radiation -2.7804878 300.5156794 3.9094077 -1.3867596 6.7630662 30.7909408 0.6236934
CO -0.3780488 3.9094077 1.5220674 0.6736353 2.3147503 2.8217189 0.1416957
NO -0.4634146 -1.3867596 0.6736353 1.1823461 1.0882695 -0.8106852 0.1765389
NO_2 -0.5853659 6.7630662 2.3147503 1.0882695 11.3635308 3.1265970 1.0441347
O_3 -2.2317073 30.7909408 2.8217189 -0.8106852 3.1265970 30.9785134 0.5946574
HC 0.1707317 0.6236934 0.1416957 0.1765389 1.0441347 0.5946574 0.4785134
eigen(S)
$values
[1] 304.2578640 28.2761046 11.4644830 2.5243296 1.2795247 0.5287288 0.2096157
$vectors
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] 0.010039244 0.07622439 0.03087761 0.9203045748 0.3423859285 0.011779079 -0.169729925
[2,] -0.993199405 0.11615518 0.00659069 -0.0002118679 0.0022391022 0.003353218 -0.001781987
[3,] -0.014062314 -0.09956775 -0.18282641 -0.1382922410 0.6500776063 -0.563893916 0.443577538
[4,] 0.004710175 0.01320423 -0.13021553 -0.3277842624 0.6431560485 0.497513370 -0.462855916
[5,] -0.024255644 -0.15038113 -0.95526318 0.1023719020 -0.2065840405 -0.009009299 -0.105029951
[6,] -0.112429558 -0.97335904 0.16981025 0.0632480276 -0.0002935726 0.051067254 -0.066992404
[7,] -0.002340785 -0.02382046 -0.08519558 0.1095073458 0.0619613872 0.657012233 0.738019426
R <- cor(air_pollution)
print(R)
Wind Solar_radiation CO NO NO_2 O_3 HC
Wind 1.0000000 -0.10144191 -0.1938032 -0.26954261 -0.1098249 -0.2535928 0.15609793
Solar_radiation -0.1014419 1.00000000 0.1827934 -0.07356907 0.1157320 0.3191237 0.05201044
CO -0.1938032 0.18279338 1.0000000 0.50215246 0.5565838 0.4109288 0.16603235
NO -0.2695426 -0.07356907 0.5021525 1.00000000 0.2968981 -0.1339521 0.23470432
NO_2 -0.1098249 0.11573199 0.5565838 0.29689814 1.0000000 0.1666422 0.44776780
O_3 -0.2535928 0.31912373 0.4109288 -0.13395214 0.1666422 1.0000000 0.15445056
HC 0.1560979 0.05201044 0.1660323 0.23470432 0.4477678 0.1544506 1.00000000
eigen(R)
$values
[1] 2.3367826 1.3860007 1.2040659 0.7270865 0.6534765 0.5366888 0.1558989
$vectors
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] 0.2368211 0.278445138 0.6434744 0.172719491 0.56053441 -0.223579220 -0.24146701
[2,] -0.2055665 -0.526613869 0.2244690 0.778136601 -0.15613432 -0.005700851 -0.01126548
[3,] -0.5510839 -0.006819502 -0.1136089 0.005301798 0.57342221 -0.109538907 0.58524622
[4,] -0.3776151 0.434674253 -0.4070978 0.290503052 -0.05669070 -0.450234781 -0.46088973
[5,] -0.4980161 0.199767367 0.1965567 -0.042428178 0.05021430 0.744968707 -0.33784371
[6,] -0.3245506 -0.566973655 0.1598465 -0.507915905 0.08024349 -0.330583071 -0.41707805
[7,] -0.3194032 0.307882771 0.5410484 -0.143082348 -0.56607057 -0.266469812 0.31391372
S_pca <- princomp(x=air_pollution)
print(summary(S_pca),loadings = T)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Standard deviation 17.234083 5.25384279 3.34537279 1.569785488 1.117613433 0.718428856 0.4523547944
Proportion of Variance 0.872948 0.08112714 0.03289281 0.007242569 0.003671092 0.001516979 0.0006014096
Cumulative Proportion 0.872948 0.95407514 0.98696795 0.994210520 0.997881611 0.999398590 1.0000000000
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Wind 0.920 0.342 -0.170
Solar_radiation -0.993 0.116
CO -0.183 -0.138 0.650 -0.564 0.444
NO -0.130 -0.328 0.643 0.498 -0.463
NO_2 -0.150 -0.955 0.102 -0.207 -0.105
O_3 -0.112 -0.973 0.170
HC 0.110 0.657 0.738
R_pca <- princomp(x=air_pollution,cor =T)
print(summary(R_pca),loadings = T)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Standard deviation 1.5286539 1.1772853 1.0972994 0.8526937 0.80837896 0.73259047 0.39484041
Proportion of Variance 0.3338261 0.1980001 0.1720094 0.1038695 0.09335379 0.07666983 0.02227128
Cumulative Proportion 0.3338261 0.5318262 0.7038356 0.8077051 0.90105889 0.97772872 1.00000000
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
Wind 0.237 0.278 0.643 0.173 0.561 -0.224 -0.241
Solar_radiation -0.206 -0.527 0.224 0.778 -0.156
CO -0.551 -0.114 0.573 -0.110 0.585
NO -0.378 0.435 -0.407 0.291 -0.450 -0.461
NO_2 -0.498 0.200 0.197 0.745 -0.338
O_3 -0.325 -0.567 0.160 -0.508 -0.331 -0.417
HC -0.319 0.308 0.541 -0.143 -0.566 -0.266 0.314
radiotherapy <- read.delim("D:/Google Drive/1_MASTER_Class/3_2_Multivariate_Analysis/3_Assignment/Wichern_data/T1-7.DAT", header=F, sep="")
colnames(radiotherapy) <- c("Symptoms","Activity","Sleep","Eat","Appetite","Skin_reaction")
head(radiotherapy)
S <- cov(radiotherapy)
print(S)
Symptoms Activity Sleep Eat Appetite Skin_reaction
Symptoms 4.6547509 0.93134537 0.58969909 0.27691531 1.074885659 0.158150852
Activity 0.9313454 0.61282116 0.11093341 0.11846905 0.388886434 -0.024851988
Sleep 0.5896991 0.11093341 0.57142886 0.08700496 0.347989910 0.110131391
Eat 0.2769153 0.11846905 0.08700496 0.11040907 0.217405649 0.021814433
Appetite 1.0748857 0.38888643 0.34798991 0.21740565 0.862172372 -0.008817694
Skin_reaction 0.1581509 -0.02485199 0.11013139 0.02181443 -0.008817694 0.861455923
R <- cor(radiotherapy)
print(R)
Symptoms Activity Sleep Eat Appetite Skin_reaction
Symptoms 1.00000000 0.55143669 0.3615773 0.38627479 0.53655840 0.07897812
Activity 0.55143669 1.00000000 0.1874625 0.45544470 0.53500626 -0.03420407
Sleep 0.36157729 0.18746250 1.0000000 0.34638617 0.49577944 0.15696886
Eat 0.38627479 0.45544470 0.3463862 1.00000000 0.70464665 0.07073348
Appetite 0.53655840 0.53500626 0.4957794 0.70464665 1.00000000 -0.01023155
Skin_reaction 0.07897812 -0.03420407 0.1569689 0.07073348 -0.01023155 1.00000000
eigen(S)
$values
[1] 5.28338056 0.88731486 0.76752442 0.43491104 0.24952575 0.05038164
$vectors
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.92848514 -0.01984119 0.34897158 -0.11153020 -0.05642494 0.01085087
[2,] 0.21161527 0.16899493 -0.22680309 0.71009823 0.60716267 -0.04831192
[3,] 0.14237865 -0.15094403 -0.50211919 -0.63510070 0.54883906 0.01559889
[4,] 0.06797085 0.01498160 -0.20314436 0.08586371 -0.12741820 0.96451019
[5,] 0.25882546 0.14375511 -0.72595939 0.11288932 -0.55384501 -0.25659019
[6,] 0.03538335 -0.96300206 -0.07981758 0.24464677 -0.06297391 -0.03444497
eigen(R)
$values
[1] 2.8643078 1.0764496 0.7776412 0.6503143 0.3880318 0.2432554
$vectors
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] -0.44485828 -0.02665962 -0.3393295 0.55114851 0.60085058 -0.14649150
[2,] -0.42929971 -0.29173810 -0.4986071 0.06136723 -0.68729718 -0.07640845
[3,] -0.35877306 0.38013490 0.6281571 0.42105976 -0.33183937 -0.21163500
[4,] -0.46285372 -0.02095878 0.1245847 -0.66560356 0.20741295 -0.53268901
[5,] -0.52127626 -0.07369010 0.2033387 -0.20052634 0.10317525 0.79412736
[6,] -0.05587718 0.87396001 -0.4298804 -0.17871526 -0.05308994 0.11626161
R_pca <- princomp(x=radiotherapy,cor =T)
print(summary(R_pca),loadings = T)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
Standard deviation 1.6924266 1.0375209 0.8818396 0.8064207 0.62292196 0.49320928
Proportion of Variance 0.4773846 0.1794083 0.1296069 0.1083857 0.06467196 0.04054257
Cumulative Proportion 0.4773846 0.6567929 0.7863998 0.8947855 0.95945743 1.00000000
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
Symptoms -0.445 -0.339 0.551 -0.601 -0.146
Activity -0.429 -0.292 -0.499 0.687
Sleep -0.359 0.380 0.628 0.421 0.332 -0.212
Eat -0.463 0.125 -0.666 -0.207 -0.533
Appetite -0.521 0.203 -0.201 -0.103 0.794
Skin_reaction 0.874 -0.430 -0.179 0.116
plot(R_pca)
print(R) #correlation
Symptoms Activity Sleep Eat Appetite Skin_reaction
Symptoms 1.00000000 0.55143669 0.3615773 0.38627479 0.53655840 0.07897812
Activity 0.55143669 1.00000000 0.1874625 0.45544470 0.53500626 -0.03420407
Sleep 0.36157729 0.18746250 1.0000000 0.34638617 0.49577944 0.15696886
Eat 0.38627479 0.45544470 0.3463862 1.00000000 0.70464665 0.07073348
Appetite 0.53655840 0.53500626 0.4957794 0.70464665 1.00000000 -0.01023155
Skin_reaction 0.07897812 -0.03420407 0.1569689 0.07073348 -0.01023155 1.00000000
print(R_pca$loadings) #PCA
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
Symptoms -0.445 -0.339 0.551 -0.601 -0.146
Activity -0.429 -0.292 -0.499 0.687
Sleep -0.359 0.380 0.628 0.421 0.332 -0.212
Eat -0.463 0.125 -0.666 -0.207 -0.533
Appetite -0.521 0.203 -0.201 -0.103 0.794
Skin_reaction 0.874 -0.430 -0.179 0.116
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
Records <- read.delim("D:/Google Drive/1_MASTER_Class/3_2_Multivariate_Analysis/3_Assignment/Wichern_data/T1-9.DAT", header=F, sep="",stringsAsFactors = F)
colnames(Records) <- c("Country","100m(s)","200m(s)","400m(s)","800m(m)","1500m(m)","3000m(m)","Marathon(m)")
head(Records)
Country <- Records$Country
Records_subset <- Records[,2:8]
Record_cor <- cor(Records_subset)
print(Record_cor)
100m(s) 200m(s) 400m(s) 800m(m) 1500m(m) 3000m(m) Marathon(m)
100m(s) 1.0000000 0.9410886 0.8707802 0.8091758 0.7815510 0.7278784 0.6689597
200m(s) 0.9410886 1.0000000 0.9088096 0.8198258 0.8013282 0.7318546 0.6799537
400m(s) 0.8707802 0.9088096 1.0000000 0.8057904 0.7197996 0.6737991 0.6769384
800m(m) 0.8091758 0.8198258 0.8057904 1.0000000 0.9050509 0.8665732 0.8539900
1500m(m) 0.7815510 0.8013282 0.7197996 0.9050509 1.0000000 0.9733801 0.7905565
3000m(m) 0.7278784 0.7318546 0.6737991 0.8665732 0.9733801 1.0000000 0.7987302
Marathon(m) 0.6689597 0.6799537 0.6769384 0.8539900 0.7905565 0.7987302 1.0000000
eigen(Record_cor)
$values
[1] 5.80762446 0.62869342 0.27933457 0.12455472 0.09097174 0.05451882 0.01430226
$vectors
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] -0.3777657 -0.4071756 -0.1405803 0.58706293 -0.16706891 0.53969730 0.08893934
[2,] -0.3832103 -0.4136291 -0.1007833 0.19407501 0.09350016 -0.74493139 -0.26565662
[3,] -0.3680361 -0.4593531 0.2370255 -0.64543118 0.32727328 0.24009405 0.12660435
[4,] -0.3947810 0.1612459 0.1475424 -0.29520804 -0.81905467 -0.01650651 -0.19521315
[5,] -0.3892610 0.3090877 -0.4219855 -0.06669044 0.02613100 -0.18898771 0.73076817
[6,] -0.3760945 0.4231899 -0.4060627 -0.08015699 0.35169796 0.24049968 -0.57150644
[7,] -0.3552031 0.3892153 0.7410610 0.32107640 0.24700821 -0.04826992 0.08208401
Record_PCA <- prcomp(Records_subset, scale = T)
summary(Record_PCA)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.4099 0.79290 0.5285 0.35292 0.3016 0.23349 0.11959
Proportion of Variance 0.8297 0.08981 0.0399 0.01779 0.0130 0.00779 0.00204
Cumulative Proportion 0.8297 0.91947 0.9594 0.97717 0.9902 0.99796 1.00000
print(Record_PCA)
Standard deviations:
[1] 2.4099013 0.7929019 0.5285211 0.3529231 0.3016152 0.2334927 0.1195921
Rotation:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
100m(s) -0.3777657 0.4071756 0.1405803 -0.58706293 0.16706891 -0.53969730 -0.08893934
200m(s) -0.3832103 0.4136291 0.1007833 -0.19407501 -0.09350016 0.74493139 0.26565662
400m(s) -0.3680361 0.4593531 -0.2370255 0.64543118 -0.32727328 -0.24009405 -0.12660435
800m(m) -0.3947810 -0.1612459 -0.1475424 0.29520804 0.81905467 0.01650651 0.19521315
1500m(m) -0.3892610 -0.3090877 0.4219855 0.06669044 -0.02613100 0.18898771 -0.73076817
3000m(m) -0.3760945 -0.4231899 0.4060627 0.08015699 -0.35169796 -0.24049968 0.57150644
Marathon(m) -0.3552031 -0.3892153 -0.7410610 -0.32107640 -0.24700821 0.04826992 -0.08208401
plot(Record_PCA)
library(plyr)
a1 <- Record_PCA$rotation[,1]
center <- Record_PCA$center
scale <- Record_PCA$scale
hm <- as.matrix(Records_subset)
PCA_score <- drop(scale(hm,center = center,scale = scale) %*% a1)
PCA_score<- data.frame(PCA_score)
PCA_score<- cbind(Country,PCA_score)
colnames(PCA_score) <- c("Country","Score")
arrange(PCA_score,desc(Score))