PCA
mydata <- read.csv("C:\\Users\\Admin\\Desktop\\SARLAKG\\R _Codes\\PCA\\Universities.csv") ## use read.csv for csv files
#View(mydata)
#help(princomp) ## to understand the api for princomp
# mydata[-1] -> Considering only numerical values for applying PCA
data <- mydata[-1]
attach(data)
cov(data)
## SAT Top10 Accept SFRatio Expenses
## SAT 11741.8400 1942.6967 -1893.63333 -358.12167 1217599.46
## Top10 1942.6967 377.6767 -329.39167 -50.86000 171413.54
## Accept -1893.6333 -329.3917 389.16667 50.68333 -158911.83
## SFRatio -358.1217 -50.8600 50.68333 16.54333 -45871.33
## Expenses 1217599.4583 171413.5417 -158911.83333 -45871.33333 208077254.33
## GradRate 733.8783 131.3067 -146.44167 -20.66500 51425.62
## GradRate
## SAT 733.87833
## Top10 131.30667
## Accept -146.44167
## SFRatio -20.66500
## Expenses 51425.62500
## GradRate 82.04333
ndata <- scale(data)
pcaObj <- princomp(data, cor = TRUE)
#pcaObj1 <- princomp(data, score = TRUE )
## princomp(mydata, score = TRUE) not_same_as prcomp(mydata, scale=TRUE); similar, but different
summary(pcaObj)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.1475766 0.8870266 0.53531473 0.40469755 0.3525708
## Proportion of Variance 0.7686808 0.1311360 0.04776031 0.02729668 0.0207177
## Cumulative Proportion 0.7686808 0.8998169 0.94757718 0.97487386 0.9955916
## Comp.6
## Standard deviation 0.162636495
## Proportion of Variance 0.004408438
## Cumulative Proportion 1.000000000
#str(pcaObj)
loadings(pcaObj)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SAT -0.458 -0.187 0.131 0.858
## Top10 -0.427 -0.200 -0.498 0.375 0.482 -0.396
## Accept 0.424 0.321 0.156 0.801 0.217
## SFRatio 0.391 -0.433 -0.606 -0.507 0.172
## Expenses -0.363 0.634 -0.205 -0.623 -0.174
## GradRate -0.379 -0.516 0.532 -0.439 0.338
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
## Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
windows()
plot(pcaObj) # graph showing importance of principal components

# Comp.1 having highest importance (highest variance)
#pcaObj$loadings
pcaObj$scores
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## [1,] -1.00987445 -1.06430962 -0.08106631 0.05695064 -0.12875425
## [2,] -2.82223781 2.25904458 -0.83682883 0.14384464 -0.12596191
## [3,] 1.11246577 1.63120889 0.26678684 1.07507502 -0.19181415
## [4,] -0.74174122 -0.04218747 -0.06050086 -0.15720812 -0.57761139
## [5,] -0.31191206 -0.63524357 -0.01024052 0.17136367 0.01272613
## [6,] -1.69669089 -0.34436328 0.25340751 0.01256433 -0.05266060
## [7,] -1.24682093 -0.49098366 0.03209382 -0.20564378 0.29350534
## [8,] -0.33874978 -0.78516859 0.49358483 0.03985631 -0.54497862
## [9,] -2.37415013 -0.38653888 -0.11609839 -0.45336562 -0.23010830
## [10,] -1.40327739 2.11951503 0.44282714 -0.63254327 0.23005353
## [11,] -1.72610332 0.08823712 -0.17040366 0.26090191 0.23331838
## [12,] -0.45085748 -0.01113295 0.17574605 0.23616563 0.26325070
## [13,] 0.04023814 -1.00920438 0.49651717 0.22929876 0.44803192
## [14,] 3.23373034 -0.37458049 0.49537282 -0.52123771 -0.63929481
## [15,] -2.23626502 -0.37179329 0.39899365 0.40696648 -0.41676068
## [16,] 5.17299212 0.77991535 0.38591233 -0.23221171 0.17928698
## [17,] -1.69964377 -0.30559745 -0.31850785 -0.29746268 -0.16342468
## [18,] 4.57814600 -0.34759136 -1.49964176 -0.45425171 -0.19114197
## [19,] 0.82260312 -0.69890615 -1.42781145 0.76077880 0.18426033
## [20,] -0.09776213 0.65044645 -0.10050844 -0.50009719 0.48721782
## [21,] 1.96318260 -0.22476756 0.25588143 -0.04847410 0.82274566
## [22,] -0.54228894 -0.07958884 0.30539348 0.13169876 0.05273991
## [23,] 0.53222092 -1.01716720 0.42371636 0.16953571 0.35781321
## [24,] 3.54869664 0.77846167 0.44936332 0.32367862 -0.35833256
## [25,] -2.30590032 -0.11770432 -0.25398866 -0.51618337 0.05589401
## Comp.6
## [1,] -0.034649638
## [2,] -0.180703168
## [3,] 0.345679459
## [4,] 0.109163092
## [5,] -0.016921270
## [6,] -0.027166160
## [7,] -0.078011984
## [8,] -0.155371653
## [9,] 0.266983932
## [10,] -0.235615124
## [11,] 0.238968449
## [12,] -0.314843521
## [13,] 0.004939215
## [14,] -0.090047785
## [15,] 0.050618633
## [16,] -0.030904694
## [17,] 0.114422592
## [18,] 0.104149297
## [19,] -0.251103268
## [20,] 0.219242132
## [21,] 0.152246521
## [22,] -0.036726444
## [23,] -0.066098999
## [24,] -0.077456415
## [25,] -0.010793201
# cbind used to bind the data in column wise
# Considering top 3 principal component scores and binding them with mydata
pcaObj$scores[,1:3]
## Comp.1 Comp.2 Comp.3
## [1,] -1.00987445 -1.06430962 -0.08106631
## [2,] -2.82223781 2.25904458 -0.83682883
## [3,] 1.11246577 1.63120889 0.26678684
## [4,] -0.74174122 -0.04218747 -0.06050086
## [5,] -0.31191206 -0.63524357 -0.01024052
## [6,] -1.69669089 -0.34436328 0.25340751
## [7,] -1.24682093 -0.49098366 0.03209382
## [8,] -0.33874978 -0.78516859 0.49358483
## [9,] -2.37415013 -0.38653888 -0.11609839
## [10,] -1.40327739 2.11951503 0.44282714
## [11,] -1.72610332 0.08823712 -0.17040366
## [12,] -0.45085748 -0.01113295 0.17574605
## [13,] 0.04023814 -1.00920438 0.49651717
## [14,] 3.23373034 -0.37458049 0.49537282
## [15,] -2.23626502 -0.37179329 0.39899365
## [16,] 5.17299212 0.77991535 0.38591233
## [17,] -1.69964377 -0.30559745 -0.31850785
## [18,] 4.57814600 -0.34759136 -1.49964176
## [19,] 0.82260312 -0.69890615 -1.42781145
## [20,] -0.09776213 0.65044645 -0.10050844
## [21,] 1.96318260 -0.22476756 0.25588143
## [22,] -0.54228894 -0.07958884 0.30539348
## [23,] 0.53222092 -1.01716720 0.42371636
## [24,] 3.54869664 0.77846167 0.44936332
## [25,] -2.30590032 -0.11770432 -0.25398866
mydata1 <- cbind(mydata,pcaObj$scores[,1:3])
#View(mydata1)
# preparing data for clustering (considering only pca scores as they represent the entire data)
clus_data <- mydata1[,c(1,8:10)]
# Normalizing the data
norm_clus <- scale(clus_data[,2:4]) # Scale function is used to normalize data
dist1 <- dist(norm_clus,method = "euclidean") # method for finding the distance
# here I am considering Euclidean distance
dist1
## 1 2 3 4 5 6 7
## 2 4.0090988
## 3 3.1949981 2.7899361
## 4 1.1362521 3.0628877 2.1192201
## 5 0.5855116 3.7177067 2.6356888 0.6899633
## 6 1.0513692 3.5376813 2.5308330 0.7945660 0.8574657
## 7 0.6750089 3.5033196 2.6149813 0.5723409 0.4618755 0.4821206
## 8 1.1380215 4.3035371 2.7811409 1.3175106 0.9369923 0.9023050 0.9953431
## 9 0.9757112 3.2127335 2.8264662 0.8424366 0.9991144 0.7450502 0.5927989
## 10 3.6496020 2.4348803 1.3084767 2.5770725 3.1929000 2.7468199 2.9807592
## 11 1.3244851 2.7363425 2.2852252 0.5127461 1.0681080 0.9111746 0.7710552
## 12 1.2803560 3.3005199 1.9563887 0.4536127 0.7714593 0.6919226 0.6942218
## 13 1.1622512 4.5490457 2.9870432 1.5190953 1.0279823 1.1684474 1.1811171
## 14 2.3328060 4.6948914 2.4536484 2.1117811 1.8857587 2.2928479 2.2168007
## 15 1.2923806 3.6922370 2.6995974 1.1422740 1.1901804 0.3640387 0.8198055
## 16 3.5829332 4.5807953 2.0889433 2.9621144 3.0382909 3.3799570 3.3119071
## 17 0.9951143 3.0310804 2.7148760 0.7061552 0.9229246 1.0476630 0.7045623
## 18 3.7239654 4.6001352 4.2107807 3.5976471 3.5369261 4.3001076 3.8662111
## 19 2.6339988 3.8223771 4.0326459 2.7016018 2.6466645 3.3080805 2.8432752
## 20 1.9395939 2.5531295 1.3893967 0.8228120 1.4330662 1.4694350 1.3868539
## 21 1.7550373 4.0365729 2.0865953 1.3780242 1.2329723 1.6749798 1.5488819
## 22 1.3148962 3.4822035 2.0361713 0.6771182 0.8494158 0.6099001 0.7483588
## 23 1.1624611 4.5564714 2.9513194 1.5109977 0.9783503 1.2975215 1.2289800
## 24 3.0677772 4.0822928 1.4947627 2.3503532 2.4996850 2.7191719 2.7085568
## 25 1.2422114 2.8435585 2.6594671 0.8010191 1.1633731 1.0012043 0.8231988
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 1.5170616
## 10 3.2463489 2.9841914
## 11 1.6758149 0.6101818 2.5131180
## 12 1.0353978 1.1078080 2.4426733 0.8671525
## 13 0.3019359 1.7157148 3.5194990 1.8997206 1.2689789
## 14 1.6918104 2.7926123 3.4748415 2.6204466 1.8246322 1.6168434
## 15 0.9939410 0.9450188 2.7791370 1.1825892 0.9945772 1.2674017 2.5018227
## 16 3.0579226 3.7895097 3.3469756 3.3952748 2.7376500 3.0708628 1.5649807
## 17 1.6957171 0.4898401 3.0225387 0.5127128 1.1174848 1.8599642 2.7001585
## 18 4.3099225 4.0589735 5.2453555 3.7978239 3.8478250 4.2625383 3.7027873
## 19 3.5577317 2.8302399 4.7377247 2.7211732 3.0869149 3.5567261 3.7052781
## 20 1.9259053 1.5464338 1.9942256 0.9766895 0.9031289 2.1351280 2.1867041
## 21 1.2943803 2.1003012 3.0300721 1.8871630 1.1358716 1.3093679 0.7453478
## 22 0.8571409 1.1868451 2.4734764 1.0413765 0.2525215 1.1168720 1.7874388
## 23 0.4898199 1.7943429 3.5756595 1.9325984 1.2814185 0.2611788 1.4283232
## 24 2.4769405 3.1668624 2.7016010 2.7675799 2.0835386 2.5433773 1.2844769
## 25 1.7947908 0.3909557 2.8112336 0.3809519 1.1613776 2.0005967 2.8895121
## 15 16 17 18 19 20 21
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 3.6118870
## 17 1.3378810 3.5960456
## 18 4.6629081 3.6790301 3.5887360
## 19 3.6411563 4.1985727 2.3738915 1.7617109
## 20 1.7500668 2.5681987 1.3448140 3.5105994 2.8809280
## 21 1.9405583 1.8527644 1.9763149 3.4301840 3.1688962 1.4980708
## 22 0.8548808 2.7788728 1.2826353 4.0571012 3.3044406 1.1150485 1.1578252
## 23 1.4510654 2.9030985 1.8708693 4.0431687 3.4096435 2.0967320 1.1343211
## 24 2.9306431 0.7501043 3.0236749 3.8070060 4.0018495 1.9495129 1.3699146
## 25 1.2280880 3.7413082 0.3654098 3.8893188 2.6580682 1.3467565 2.1629654
## 22 23 24
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23 1.1660890
## 24 2.1098234 2.4145696
## 25 1.3028605 2.0499664 3.1259792
hcluster <- hclust(dist1,method="complete") # method here is complete linkage
windows()
plot(hcluster,hang = -1) # Displaying Dendrogram
