# Load the raw data set from a CSV file picked interactively.
mydata <- read.csv(file.choose())
View(mydata)
#help(princomp) #to understand the api for princomp
#?princomp
# Keep every column except the first one for the analysis.
# NOTE(review): presumably the first column is a row label (e.g. a name) -- confirm.
data <- mydata[-1]
View(data)
# NOTE(review): attach() puts the columns of `data` on the search path. Any
# later code that refers to these columns without naming a data frame that
# contains them depends on this call, so check those uses before removing it.
attach(data)
# Pairwise correlation matrix of the analysis columns.
cor(data)
## SAT Top10 Accept SFRatio Expenses GradRate
## SAT 1.0000000 0.9225222 -0.8858496 -0.8125517 0.7789760 0.7477120
## Top10 0.9225222 1.0000000 -0.8591811 -0.6434351 0.6114666 0.7459420
## Accept -0.8858496 -0.8591811 1.0000000 0.6316636 -0.5584395 -0.8195495
## SFRatio -0.8125517 -0.6434351 0.6316636 1.0000000 -0.7818394 -0.5609217
## Expenses 0.7789760 0.6114666 -0.5584395 -0.7818394 1.0000000 0.3935914
## GradRate 0.7477120 0.7459420 -0.8195495 -0.5609217 0.3935914 1.0000000
# Per-column summary statistics (min, quartiles, mean, max) of the analysis data.
summary(data)
## SAT Top10 Accept SFRatio
## Min. :1005 Min. : 28.00 Min. :14.0 Min. : 6.00
## 1st Qu.:1240 1st Qu.: 74.00 1st Qu.:24.0 1st Qu.:11.00
## Median :1285 Median : 81.00 Median :36.0 Median :12.00
## Mean :1266 Mean : 76.48 Mean :39.2 Mean :12.72
## 3rd Qu.:1340 3rd Qu.: 90.00 3rd Qu.:50.0 3rd Qu.:14.00
## Max. :1415 Max. :100.00 Max. :90.0 Max. :25.00
## Expenses GradRate
## Min. : 8704 Min. :67.00
## 1st Qu.:15140 1st Qu.:81.00
## Median :27553 Median :90.00
## Mean :27388 Mean :86.72
## 3rd Qu.:34870 3rd Qu.:94.00
## Max. :63575 Max. :97.00
# Standardise the columns of `data` with scale() before running the PCA.
# NOTE(review): `data` still contains GradRate, which is used as the response
# in the regressions further down -- confirm it is meant to be PCA input.
normdata <- scale(data)
# Principal component analysis based on the correlation matrix (cor=TRUE).
pcaObj <- princomp(normdata, cor=TRUE) #princomp(mydata, cor = TRUE) not_same_as prcomp(mydata, scale=TRUE); similar , but different
# Standard deviation, proportion and cumulative proportion of variance per component.
summary(pcaObj) #prop of variance
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.1475766 0.8870266 0.53531473 0.40469755 0.3525708
## Proportion of Variance 0.7686808 0.1311360 0.04776031 0.02729668 0.0207177
## Cumulative Proportion 0.7686808 0.8998169 0.94757718 0.97487386 0.9955916
## Comp.6
## Standard deviation 0.162636495
## Proportion of Variance 0.004408438
## Cumulative Proportion 1.000000000
# Loadings of each original variable on each principal component.
pcaObj$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SAT 0.458 0.187 0.131 0.858
## Top10 0.427 -0.200 0.498 0.375 0.482 -0.396
## Accept -0.424 0.321 -0.156 0.801 0.217
## SFRatio -0.391 -0.433 0.606 -0.507 0.172
## Expenses 0.363 0.634 0.205 -0.623 -0.174
## GradRate 0.379 -0.516 -0.532 -0.439 0.338
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
## Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
# Scores of every observation on all components; only the first three
# (Comp.1-Comp.3) are carried forward in the rest of the script.
pcaObj$scores #scores on all components; the top 3 are used later to represent the data
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## [1,] 1.00987445 -1.06430962 0.08106631 0.05695064 -0.12875425
## [2,] 2.82223781 2.25904458 0.83682883 0.14384464 -0.12596191
## [3,] -1.11246577 1.63120889 -0.26678684 1.07507502 -0.19181415
## [4,] 0.74174122 -0.04218747 0.06050086 -0.15720812 -0.57761139
## [5,] 0.31191206 -0.63524357 0.01024052 0.17136367 0.01272613
## [6,] 1.69669089 -0.34436328 -0.25340751 0.01256433 -0.05266060
## [7,] 1.24682093 -0.49098366 -0.03209382 -0.20564378 0.29350534
## [8,] 0.33874978 -0.78516859 -0.49358483 0.03985631 -0.54497862
## [9,] 2.37415013 -0.38653888 0.11609839 -0.45336562 -0.23010830
## [10,] 1.40327739 2.11951503 -0.44282714 -0.63254327 0.23005353
## [11,] 1.72610332 0.08823712 0.17040366 0.26090191 0.23331838
## [12,] 0.45085748 -0.01113295 -0.17574605 0.23616563 0.26325070
## [13,] -0.04023814 -1.00920438 -0.49651717 0.22929876 0.44803192
## [14,] -3.23373034 -0.37458049 -0.49537282 -0.52123771 -0.63929481
## [15,] 2.23626502 -0.37179329 -0.39899365 0.40696648 -0.41676068
## [16,] -5.17299212 0.77991535 -0.38591233 -0.23221171 0.17928698
## [17,] 1.69964377 -0.30559745 0.31850785 -0.29746268 -0.16342468
## [18,] -4.57814600 -0.34759136 1.49964176 -0.45425171 -0.19114197
## [19,] -0.82260312 -0.69890615 1.42781145 0.76077880 0.18426033
## [20,] 0.09776213 0.65044645 0.10050844 -0.50009719 0.48721782
## [21,] -1.96318260 -0.22476756 -0.25588143 -0.04847410 0.82274566
## [22,] 0.54228894 -0.07958884 -0.30539348 0.13169876 0.05273991
## [23,] -0.53222092 -1.01716720 -0.42371636 0.16953571 0.35781321
## [24,] -3.54869664 0.77846167 -0.44936332 0.32367862 -0.35833256
## [25,] 2.30590032 -0.11770432 0.25398866 -0.51618337 0.05589401
## Comp.6
## [1,] -0.034649638
## [2,] -0.180703168
## [3,] 0.345679459
## [4,] 0.109163092
## [5,] -0.016921270
## [6,] -0.027166160
## [7,] -0.078011984
## [8,] -0.155371653
## [9,] 0.266983932
## [10,] -0.235615124
## [11,] 0.238968449
## [12,] -0.314843521
## [13,] 0.004939215
## [14,] -0.090047785
## [15,] 0.050618633
## [16,] -0.030904694
## [17,] 0.114422592
## [18,] 0.104149297
## [19,] -0.251103268
## [20,] 0.219242132
## [21,] 0.152246521
## [22,] -0.036726444
## [23,] -0.066098999
## [24,] -0.077456415
## [25,] -0.010793201
# Append the scores of the first three principal components to the original
# data as columns Comp.1, Comp.2 and Comp.3.
# NOTE(review): this overwrites `mydata`, so re-running the line appends the
# score columns a second time.
mydata <- cbind(mydata,pcaObj$scores[,1:3])
View(mydata)
# Check the dimensions after adding the score columns.
dim(mydata)
## [1] 25 10
# Response (GradRate) plus the first three principal-component scores.
# Columns are picked by name so the subset does not depend on column order.
new_data <- mydata[, c("GradRate", "Comp.1", "Comp.2", "Comp.3")]
View(new_data)
# attach(new_data) was dropped: the models fitted next receive their data
# through the `data` argument, and attaching only masked the GradRate that
# was already on the search path.
# M1: GradRate regressed on the first three principal-component scores.
M1 <- lm(GradRate ~ Comp.1 + Comp.2 + Comp.3, data = new_data)
# M2: GradRate regressed on the original predictors. Those columns live in
# `data`, not in `new_data` (which holds only GradRate and the component
# scores), so `data` is passed explicitly instead of relying on the variables
# being found through the search path.
M2 <- lm(GradRate ~ SAT + Top10 + Accept + SFRatio + Expenses, data = data)
# Coefficients, residual summary and fit statistics for the original-variable model.
summary(M2)
##
## Call:
## lm(formula = GradRate ~ SAT + Top10 + Accept + SFRatio + Expenses,
## data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.567 -2.728 1.182 3.420 7.218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 81.4126347 65.3064777 1.247 0.2277
## SAT 0.0180570 0.0545591 0.331 0.7443
## Top10 0.0538322 0.1779754 0.302 0.7656
## Accept -0.2700443 0.1391401 -1.941 0.0673 .
## SFRatio -0.4320340 0.5608264 -0.770 0.4506
## Expenses -0.0002043 0.0001526 -1.339 0.1965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.478 on 19 degrees of freedom
## Multiple R-squared: 0.7104, Adjusted R-squared: 0.6342
## F-statistic: 9.323 on 5 and 19 DF, p-value: 0.0001295
# Correlation matrix of the original variables (same call as near the top of
# the script).
cor(data)
## SAT Top10 Accept SFRatio Expenses GradRate
## SAT 1.0000000 0.9225222 -0.8858496 -0.8125517 0.7789760 0.7477120
## Top10 0.9225222 1.0000000 -0.8591811 -0.6434351 0.6114666 0.7459420
## Accept -0.8858496 -0.8591811 1.0000000 0.6316636 -0.5584395 -0.8195495
## SFRatio -0.8125517 -0.6434351 0.6316636 1.0000000 -0.7818394 -0.5609217
## Expenses 0.7789760 0.6114666 -0.5584395 -0.7818394 1.0000000 0.3935914
## GradRate 0.7477120 0.7459420 -0.8195495 -0.5609217 0.3935914 1.0000000
# Open a new graphics device. dev.new() works on every platform, whereas
# windows() exists only on Windows builds of R.
dev.new()
# Scatterplot matrix of the original variables.
plot(data)
### Clustering on the principal-component scores
# Columns 8:10 of mydata are Comp.1-Comp.3. No extra normalisation is applied
# here (the scale() call below is left commented out).
clus_data <- mydata[, 8:10]
#norm_clus <- scale(clus_data) # Scale function is used to normalize data
# Pairwise Euclidean distances between observations in component space.
dist1 <- dist(clus_data, method = "euclidean")
dist1
## 1 2 3 4 5 6 7
## 2 3.8601193
## 3 3.4483547 4.1344936
## 4 1.0569068 3.1979400 2.5190189
## 5 0.8223536 3.9194244 2.6911713 0.7341626
## 6 1.0497201 3.0386187 3.4343013 1.0496559 1.4393519
## 7 0.6305962 3.2862776 3.1819830 0.6819804 0.9469201 0.5223603
## 8 0.9265808 4.1478847 2.8277808 1.0106602 0.5263438 1.4477564 1.0602414
## 9 1.5237613 2.7783712 4.0465291 1.6692597 2.0798764 0.7728288 1.1418147
## 10 3.2505337 1.9158383 2.5687345 2.3160153 2.9975062 2.4885071 2.6472408
## 11 1.3599003 2.5215151 3.2602718 0.9990285 1.5965630 0.6063201 0.7785972
## 12 1.2196859 3.4354706 2.2692630 0.3760191 0.6658911 1.2919654 0.9404523
## 13 1.1997398 4.5445606 2.8590611 1.3626748 0.7215676 1.8756435 1.4631346
## 14 4.3377632 6.7368756 2.9283453 4.0278847 3.5909846 4.9364475 4.5059424
## 15 1.4879752 2.9651188 3.9042918 1.5979288 1.9849466 0.5595426 1.0619894
## 16 6.4689314 8.2223245 4.1505139 5.9882559 5.6783612 6.9623347 6.5539585
## 17 1.0525229 2.8471499 3.4643540 1.0264160 1.4592788 0.5732353 0.6019453
## 18 5.8096483 7.8739795 4.3642708 5.5195650 5.1199343 6.5151186 6.0247004
## 19 2.3033055 4.7311308 2.8957071 2.1789885 1.8167787 3.0494328 2.5410768
## 20 1.9423479 3.2484648 1.6004538 0.9465999 1.3065249 1.9161096 1.6250484
## 21 3.1076410 5.5012350 2.0416872 2.7294773 2.3270942 3.6618279 3.2287884
## 22 1.1565736 3.4600643 2.3804480 0.4184002 0.6793015 1.1855176 0.8604090
## 23 1.6232948 4.8554023 2.7157332 1.6757182 1.0231054 2.3344631 1.8961084
## 24 4.9454759 6.6659749 2.5876114 4.3978726 4.1369189 5.3677952 4.9782119
## 25 1.6142016 2.5010483 3.8749363 1.5778891 2.0744274 0.8245976 1.1588053
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 2.1618219
## 10 3.0940236 2.7450497
## 11 1.7687476 0.8051869 2.1462429
## 12 0.8442279 1.9812010 2.3490630 1.3251211
## 13 0.4402641 2.5675437 3.4460854 2.1838323 1.1576768
## 14 3.5959978 5.6411314 5.2654643 5.0256749 3.7162404 3.2559394
## 15 1.9443227 0.5334319 2.6272432 0.8922479 1.8351009 2.3660664 5.4708451
## 16 5.7306527 7.6532337 6.7115637 6.9559632 5.6830988 5.4367588 2.2595527
## 17 1.6557522 0.7088582 2.5590300 0.4215930 1.3749413 2.0460981 5.0005340
## 18 5.3235599 7.0887327 6.7555329 6.4575831 5.3114033 5.0014988 2.4058813
## 19 2.2467633 3.4694950 4.0493618 2.9489953 2.1601228 2.1003382 3.1011868
## 20 1.5722635 2.5015043 2.0390552 1.7240823 0.7991747 1.7691593 3.5361839
## 21 2.3810596 4.3562592 4.1065390 3.7269990 2.4247992 2.0906846 1.3015727
## 22 0.7580811 1.9046231 2.3656388 1.2868430 0.1727842 1.1135758 3.7922861
## 23 0.9040437 3.0225957 3.6858234 2.5835878 1.4282988 0.4974037 2.7778059
## 24 4.1903621 6.0627623 5.1303522 5.3557479 4.0859221 3.9379251 1.1961717
## 25 2.2077264 0.3097479 2.5110617 0.6209371 1.9071480 2.6196177 5.5959839
## 15 16 17 18 19 20 21
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 7.4982461
## 17 0.8984168 6.9934019
## 18 7.0740087 2.2760553 6.3880737
## 19 3.5778337 4.9398777 2.7833389 3.7726230
## 20 2.4223276 5.2947349 1.8781822 4.9817437 2.1046515
## 21 4.2044569 3.3658837 3.7084705 3.1519805 2.0881932 2.2672685
## 22 1.7215398 5.7801097 1.3340926 5.4358837 2.2913934 0.9462093 2.5101625
## 23 2.8428213 4.9767150 2.4573254 4.5295859 1.9009911 1.8581242 1.6442984
## 24 5.8984239 1.6255350 5.4138607 2.4751530 3.6246361 3.6899063 1.8862022
## 25 0.7041280 7.5596975 0.6379761 6.9996138 3.3916352 2.3429652 4.3007556
## 22 23 24
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23 1.4309525
## 24 4.1824802 3.5105650
## 25 1.8505908 3.0533997 5.9644047
# k-means clustering of the component scores into 4 clusters. This is not a
# linkage method; the hierarchical alternative is kept commented out below.
# k-means starts from randomly chosen centres, so the seed is fixed to make
# the cluster labels reproducible, and nstart keeps the best of several
# random starts. Output pasted from earlier unseeded runs may differ.
set.seed(123)
fit <- kmeans(clus_data, centers = 4, nstart = 25)
#fit <- hclust(dist1,method="single")
# Inspect the structure of the fitted k-means object.
str(fit)
## List of 9
## $ cluster : int [1:25] 4 3 2 4 4 3 4 4 3 3 ...
## $ centers : num [1:4, 1:3] -4.876 -2.465 2.033 0.304 0.216 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:3] "Comp.1" "Comp.2" "Comp.3"
## $ totss : num 142
## $ withinss : num [1:4] 2.59 6.53 11.89 9.56
## $ tot.withinss: num 30.6
## $ betweenss : num 112
## $ size : int [1:4] 2 4 8 11
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
#plot(fit,hang=-1)
#groups <- cuttree(fit1, h-3) #cutting the dendrogram for 5 clusters
# Cluster assignment of every observation.
fit$cluster
## [1] 4 3 2 4 4 3 4 4 3 3 3 4 4 2 3 1 3 1 4 4 2 4 4 2 3
groups<-fit$cluster
# as.data.frame() names the single column after the variable, i.e. "groups".
groups <- as.data.frame(groups) #cluster numbering
final1 <- cbind(groups,mydata) #binding columnwise with original data
View(final1)
# Per-cluster means of the remaining columns. Column 2 of final1 is the first
# column of mydata and columns 9:11 are Comp.1-Comp.3; both are left out
# before averaging.
View(aggregate(final1[,-c(2,9:11)], by=list(final1$groups),FUN=mean)) #inferences can be drawn from these per-cluster means