Principal Component Analysis (PCA)

on university data

# Pick the universities CSV interactively and load it into a data frame.
mydata <- read.csv(file = file.choose())
# Open the loaded table in the data viewer for a quick visual check.
View(mydata)

#help(princomp) #to understand the api for princomp
#?princomp

The first column in mydata holds the university name.

mydata[-1] drops that first column, so only the numerical variables are considered when applying PCA.

# Keep every column except the first one, leaving only numeric variables.
data <- mydata[, -1, drop = FALSE]
View(data)

# NOTE(review): attach() puts the columns of `data` on the search path.
# It is kept because a later lm() call names these columns without
# finding them in its own `data =` argument.
attach(data)
# Pairwise correlations between the numeric variables.
cor(x = data)

##                 SAT      Top10     Accept    SFRatio   Expenses   GradRate
## SAT       1.0000000  0.9225222 -0.8858496 -0.8125517  0.7789760  0.7477120
## Top10     0.9225222  1.0000000 -0.8591811 -0.6434351  0.6114666  0.7459420
## Accept   -0.8858496 -0.8591811  1.0000000  0.6316636 -0.5584395 -0.8195495
## SFRatio  -0.8125517 -0.6434351  0.6316636  1.0000000 -0.7818394 -0.5609217
## Expenses  0.7789760  0.6114666 -0.5584395 -0.7818394  1.0000000  0.3935914
## GradRate  0.7477120  0.7459420 -0.8195495 -0.5609217  0.3935914  1.0000000

# Per-column five-number summary and mean of the numeric variables.
summary(data)

##       SAT           Top10            Accept        SFRatio     
##  Min.   :1005   Min.   : 28.00   Min.   :14.0   Min.   : 6.00  
##  1st Qu.:1240   1st Qu.: 74.00   1st Qu.:24.0   1st Qu.:11.00  
##  Median :1285   Median : 81.00   Median :36.0   Median :12.00  
##  Mean   :1266   Mean   : 76.48   Mean   :39.2   Mean   :12.72  
##  3rd Qu.:1340   3rd Qu.: 90.00   3rd Qu.:50.0   3rd Qu.:14.00  
##  Max.   :1415   Max.   :100.00   Max.   :90.0   Max.   :25.00  
##     Expenses        GradRate    
##  Min.   : 8704   Min.   :67.00  
##  1st Qu.:15140   1st Qu.:81.00  
##  Median :27553   Median :90.00  
##  Mean   :27388   Mean   :86.72  
##  3rd Qu.:34870   3rd Qu.:94.00  
##  Max.   :63575   Max.   :97.00

# Centre and scale every variable before extracting components.
normdata <- scale(x = data)

# PCA on the correlation matrix of the standardised data.
# princomp(mydata, cor = TRUE) and prcomp(mydata, scale = TRUE) are
# similar, but their results are not identical.
pcaObj <- princomp(normdata, cor = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(object = pcaObj)

## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4    Comp.5
## Standard deviation     2.1475766 0.8870266 0.53531473 0.40469755 0.3525708
## Proportion of Variance 0.7686808 0.1311360 0.04776031 0.02729668 0.0207177
## Cumulative Proportion  0.7686808 0.8998169 0.94757718 0.97487386 0.9955916
##                             Comp.6
## Standard deviation     0.162636495
## Proportion of Variance 0.004408438
## Cumulative Proportion  1.000000000

graph showing importance of principal components

Comp.1 has the highest importance (highest variance): it explains about 77% of the total variance.

# Weights of the original variables in each principal component.
loadings(pcaObj)

## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SAT       0.458         0.187  0.131         0.858
## Top10     0.427 -0.200  0.498  0.375  0.482 -0.396
## Accept   -0.424  0.321 -0.156         0.801  0.217
## SFRatio  -0.391 -0.433  0.606 -0.507         0.172
## Expenses  0.363  0.634  0.205 -0.623        -0.174
## GradRate  0.379 -0.516 -0.532 -0.439  0.338       
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.167  0.167  0.167  0.167  0.167  0.167
## Cumulative Var  0.167  0.333  0.500  0.667  0.833  1.000

# Scores of every observation on all six components; only the first
# three are kept later to represent the data.
pcaObj$scores

##            Comp.1      Comp.2      Comp.3      Comp.4      Comp.5
##  [1,]  1.00987445 -1.06430962  0.08106631  0.05695064 -0.12875425
##  [2,]  2.82223781  2.25904458  0.83682883  0.14384464 -0.12596191
##  [3,] -1.11246577  1.63120889 -0.26678684  1.07507502 -0.19181415
##  [4,]  0.74174122 -0.04218747  0.06050086 -0.15720812 -0.57761139
##  [5,]  0.31191206 -0.63524357  0.01024052  0.17136367  0.01272613
##  [6,]  1.69669089 -0.34436328 -0.25340751  0.01256433 -0.05266060
##  [7,]  1.24682093 -0.49098366 -0.03209382 -0.20564378  0.29350534
##  [8,]  0.33874978 -0.78516859 -0.49358483  0.03985631 -0.54497862
##  [9,]  2.37415013 -0.38653888  0.11609839 -0.45336562 -0.23010830
## [10,]  1.40327739  2.11951503 -0.44282714 -0.63254327  0.23005353
## [11,]  1.72610332  0.08823712  0.17040366  0.26090191  0.23331838
## [12,]  0.45085748 -0.01113295 -0.17574605  0.23616563  0.26325070
## [13,] -0.04023814 -1.00920438 -0.49651717  0.22929876  0.44803192
## [14,] -3.23373034 -0.37458049 -0.49537282 -0.52123771 -0.63929481
## [15,]  2.23626502 -0.37179329 -0.39899365  0.40696648 -0.41676068
## [16,] -5.17299212  0.77991535 -0.38591233 -0.23221171  0.17928698
## [17,]  1.69964377 -0.30559745  0.31850785 -0.29746268 -0.16342468
## [18,] -4.57814600 -0.34759136  1.49964176 -0.45425171 -0.19114197
## [19,] -0.82260312 -0.69890615  1.42781145  0.76077880  0.18426033
## [20,]  0.09776213  0.65044645  0.10050844 -0.50009719  0.48721782
## [21,] -1.96318260 -0.22476756 -0.25588143 -0.04847410  0.82274566
## [22,]  0.54228894 -0.07958884 -0.30539348  0.13169876  0.05273991
## [23,] -0.53222092 -1.01716720 -0.42371636  0.16953571  0.35781321
## [24,] -3.54869664  0.77846167 -0.44936332  0.32367862 -0.35833256
## [25,]  2.30590032 -0.11770432  0.25398866 -0.51618337  0.05589401
##             Comp.6
##  [1,] -0.034649638
##  [2,] -0.180703168
##  [3,]  0.345679459
##  [4,]  0.109163092
##  [5,] -0.016921270
##  [6,] -0.027166160
##  [7,] -0.078011984
##  [8,] -0.155371653
##  [9,]  0.266983932
## [10,] -0.235615124
## [11,]  0.238968449
## [12,] -0.314843521
## [13,]  0.004939215
## [14,] -0.090047785
## [15,]  0.050618633
## [16,] -0.030904694
## [17,]  0.114422592
## [18,]  0.104149297
## [19,] -0.251103268
## [20,]  0.219242132
## [21,]  0.152246521
## [22,] -0.036726444
## [23,] -0.066098999
## [24,] -0.077456415
## [25,] -0.010793201

cbind used to bind the data in columnwise

Taking the top 3 principal component scores and binding them to mydata.

# Append the scores on the first three components as new columns.
mydata <- cbind(
  mydata,
  pcaObj$scores[, seq_len(3)]
)
View(mydata)
# Rows and columns after the append.
dim(mydata)

## [1] 25 10

Preparing data for modelling: GradRate together with the top 3 PCA scores, as these scores represent about 95% of the variance in the data.

# Response (GradRate, column 7) plus the three PCA score columns (8 to 10).
new_data <- mydata[, 7:10]
View(new_data)
# attach(new_data) was dropped: it only masked the GradRate already on the
# search path, and the models below receive their data through `data =`.

# Model 1: GradRate explained by the top three principal-component scores.
# NOTE(review): M1 is fitted but never summarised in this script.
M1 <- lm(GradRate ~ Comp.1 + Comp.2 + Comp.3, data = new_data)

# Model 2: GradRate explained by the original predictors. These columns are
# not part of `new_data`, so the model is fitted on `mydata`, which holds
# them, instead of relying on an attached data frame being on the search path.
M2 <- lm(GradRate ~ SAT + Top10 + Accept + SFRatio + Expenses, data = mydata)
summary(M2)

## 
## Call:
## lm(formula = GradRate ~ SAT + Top10 + Accept + SFRatio + Expenses, 
##     data = new_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.567  -2.728   1.182   3.420   7.218 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 81.4126347 65.3064777   1.247   0.2277  
## SAT          0.0180570  0.0545591   0.331   0.7443  
## Top10        0.0538322  0.1779754   0.302   0.7656  
## Accept      -0.2700443  0.1391401  -1.941   0.0673 .
## SFRatio     -0.4320340  0.5608264  -0.770   0.4506  
## Expenses    -0.0002043  0.0001526  -1.339   0.1965  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.478 on 19 degrees of freedom
## Multiple R-squared:  0.7104, Adjusted R-squared:  0.6342 
## F-statistic: 9.323 on 5 and 19 DF,  p-value: 0.0001295

# Correlation matrix of the numeric variables (same call as earlier in
# the script).
cor(data)

##                 SAT      Top10     Accept    SFRatio   Expenses   GradRate
## SAT       1.0000000  0.9225222 -0.8858496 -0.8125517  0.7789760  0.7477120
## Top10     0.9225222  1.0000000 -0.8591811 -0.6434351  0.6114666  0.7459420
## Accept   -0.8858496 -0.8591811  1.0000000  0.6316636 -0.5584395 -0.8195495
## SFRatio  -0.8125517 -0.6434351  0.6316636  1.0000000 -0.7818394 -0.5609217
## Expenses  0.7789760  0.6114666 -0.5584395 -0.7818394  1.0000000  0.3935914
## GradRate  0.7477120  0.7459420 -0.8195495 -0.5609217  0.3935914  1.0000000

# Open a new graphics device. dev.new() is used instead of windows() so the
# script also runs on macOS and Linux, where windows() does not exist.
dev.new()
# Scatterplot matrix of all numeric variables.
plot(data)

### Preparing the PCA scores for clustering (no further normalization is applied)

# Clustering input: the three principal-component score columns (8 to 10).
clus_data <- mydata[, 8:10]
#norm_clus <- scale(clus_data) # Scale function is used to normalize data
# Pairwise Euclidean distances between observations, using the documented
# spelling of the method name.
dist1 <- dist(clus_data, method = "euclidean")

here I am considering Euclidean distance

# Print the lower-triangular distance matrix.
dist1

##            1         2         3         4         5         6         7
## 2  3.8601193                                                            
## 3  3.4483547 4.1344936                                                  
## 4  1.0569068 3.1979400 2.5190189                                        
## 5  0.8223536 3.9194244 2.6911713 0.7341626                              
## 6  1.0497201 3.0386187 3.4343013 1.0496559 1.4393519                    
## 7  0.6305962 3.2862776 3.1819830 0.6819804 0.9469201 0.5223603          
## 8  0.9265808 4.1478847 2.8277808 1.0106602 0.5263438 1.4477564 1.0602414
## 9  1.5237613 2.7783712 4.0465291 1.6692597 2.0798764 0.7728288 1.1418147
## 10 3.2505337 1.9158383 2.5687345 2.3160153 2.9975062 2.4885071 2.6472408
## 11 1.3599003 2.5215151 3.2602718 0.9990285 1.5965630 0.6063201 0.7785972
## 12 1.2196859 3.4354706 2.2692630 0.3760191 0.6658911 1.2919654 0.9404523
## 13 1.1997398 4.5445606 2.8590611 1.3626748 0.7215676 1.8756435 1.4631346
## 14 4.3377632 6.7368756 2.9283453 4.0278847 3.5909846 4.9364475 4.5059424
## 15 1.4879752 2.9651188 3.9042918 1.5979288 1.9849466 0.5595426 1.0619894
## 16 6.4689314 8.2223245 4.1505139 5.9882559 5.6783612 6.9623347 6.5539585
## 17 1.0525229 2.8471499 3.4643540 1.0264160 1.4592788 0.5732353 0.6019453
## 18 5.8096483 7.8739795 4.3642708 5.5195650 5.1199343 6.5151186 6.0247004
## 19 2.3033055 4.7311308 2.8957071 2.1789885 1.8167787 3.0494328 2.5410768
## 20 1.9423479 3.2484648 1.6004538 0.9465999 1.3065249 1.9161096 1.6250484
## 21 3.1076410 5.5012350 2.0416872 2.7294773 2.3270942 3.6618279 3.2287884
## 22 1.1565736 3.4600643 2.3804480 0.4184002 0.6793015 1.1855176 0.8604090
## 23 1.6232948 4.8554023 2.7157332 1.6757182 1.0231054 2.3344631 1.8961084
## 24 4.9454759 6.6659749 2.5876114 4.3978726 4.1369189 5.3677952 4.9782119
## 25 1.6142016 2.5010483 3.8749363 1.5778891 2.0744274 0.8245976 1.1588053
##            8         9        10        11        12        13        14
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9  2.1618219                                                            
## 10 3.0940236 2.7450497                                                  
## 11 1.7687476 0.8051869 2.1462429                                        
## 12 0.8442279 1.9812010 2.3490630 1.3251211                              
## 13 0.4402641 2.5675437 3.4460854 2.1838323 1.1576768                    
## 14 3.5959978 5.6411314 5.2654643 5.0256749 3.7162404 3.2559394          
## 15 1.9443227 0.5334319 2.6272432 0.8922479 1.8351009 2.3660664 5.4708451
## 16 5.7306527 7.6532337 6.7115637 6.9559632 5.6830988 5.4367588 2.2595527
## 17 1.6557522 0.7088582 2.5590300 0.4215930 1.3749413 2.0460981 5.0005340
## 18 5.3235599 7.0887327 6.7555329 6.4575831 5.3114033 5.0014988 2.4058813
## 19 2.2467633 3.4694950 4.0493618 2.9489953 2.1601228 2.1003382 3.1011868
## 20 1.5722635 2.5015043 2.0390552 1.7240823 0.7991747 1.7691593 3.5361839
## 21 2.3810596 4.3562592 4.1065390 3.7269990 2.4247992 2.0906846 1.3015727
## 22 0.7580811 1.9046231 2.3656388 1.2868430 0.1727842 1.1135758 3.7922861
## 23 0.9040437 3.0225957 3.6858234 2.5835878 1.4282988 0.4974037 2.7778059
## 24 4.1903621 6.0627623 5.1303522 5.3557479 4.0859221 3.9379251 1.1961717
## 25 2.2077264 0.3097479 2.5110617 0.6209371 1.9071480 2.6196177 5.5959839
##           15        16        17        18        19        20        21
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9                                                                       
## 10                                                                      
## 11                                                                      
## 12                                                                      
## 13                                                                      
## 14                                                                      
## 15                                                                      
## 16 7.4982461                                                            
## 17 0.8984168 6.9934019                                                  
## 18 7.0740087 2.2760553 6.3880737                                        
## 19 3.5778337 4.9398777 2.7833389 3.7726230                              
## 20 2.4223276 5.2947349 1.8781822 4.9817437 2.1046515                    
## 21 4.2044569 3.3658837 3.7084705 3.1519805 2.0881932 2.2672685          
## 22 1.7215398 5.7801097 1.3340926 5.4358837 2.2913934 0.9462093 2.5101625
## 23 2.8428213 4.9767150 2.4573254 4.5295859 1.9009911 1.8581242 1.6442984
## 24 5.8984239 1.6255350 5.4138607 2.4751530 3.6246361 3.6899063 1.8862022
## 25 0.7041280 7.5596975 0.6379761 6.9996138 3.3916352 2.3429652 4.3007556
##           22        23        24
## 2                               
## 3                               
## 4                               
## 5                               
## 6                               
## 7                               
## 8                               
## 9                               
## 10                              
## 11                              
## 12                              
## 13                              
## 14                              
## 15                              
## 16                              
## 17                              
## 18                              
## 19                              
## 20                              
## 21                              
## 22                              
## 23 1.4309525                    
## 24 4.1824802 3.5105650          
## 25 1.8505908 3.0533997 5.9644047

Clustering the data: the hierarchical (hclust) call is commented out below, and k-means with 4 clusters is used instead.

# k-means starts from randomly chosen centres, so fix the RNG seed to make
# the cluster assignment reproducible from run to run.
set.seed(123)
# Partition the PCA scores into 4 clusters (k-means has no linkage method).
fit <- kmeans(clus_data, centers = 4)
#fit <- hclust(dist1,method="single")
# Inspect the components of the fitted kmeans object.
str(fit)

## List of 9
##  $ cluster     : int [1:25] 4 3 2 4 4 3 4 4 3 3 ...
##  $ centers     : num [1:4, 1:3] -4.876 -2.465 2.033 0.304 0.216 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:3] "Comp.1" "Comp.2" "Comp.3"
##  $ totss       : num 142
##  $ withinss    : num [1:4] 2.59 6.53 11.89 9.56
##  $ tot.withinss: num 30.6
##  $ betweenss   : num 112
##  $ size        : int [1:4] 2 4 8 11
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

#plot(fit,hang=-1)

cutree() will not work here, because fit is a k-means result rather than an hclust tree.

# Leftover from the hierarchical approach: the function is spelled cutree()
# and it needs an hclust tree, not a kmeans fit.
#groups <- cutree(fit1, h-3) #cutting the dendrogram for 5 clusters
# Cluster label that k-means assigned to each row of clus_data.
fit$cluster

##  [1] 4 3 2 4 4 3 4 4 3 3 3 4 4 2 3 1 3 1 4 4 2 4 4 2 3

# Cluster label of each observation, turned into a one-column data frame
# named `groups`.
groups <- fit$cluster
groups <- as.data.frame(groups)

# Put the cluster label in front of the original data plus PCA scores.
final1 <- cbind(groups, mydata)
View(final1)

# Mean of each variable per cluster; the university-name column (2) and the
# three score columns (9 to 11) are left out of the averaging.
View(aggregate(final1[, -c(2, 9:11)], by = list(final1$groups), FUN = mean))

PCA_univ_data

amit

9/7/2019

principal component Analysis (PCA)

on university data

The first column in mydata holds the university name.

mydata[-1] drops that first column, so only the numerical variables are considered when applying PCA.

graph showing importance of principal components

Comp.1 has the highest importance (highest variance): it explains about 77% of the total variance.

cbind used to bind the data in columnwise

considering top 3 principal component scores and binding them with mydata

Preparing data for modelling: GradRate together with the top 3 PCA scores, as these scores represent about 95% of the variance in the data.

here I am considering Euclidean distance

Clustering the data: the hierarchical (hclust) call is commented out, and k-means with 4 clusters is used instead.

cutree() will not work here, because fit is a k-means result rather than an hclust tree.