# 1. Load data ------------------------------------------------------------
univ <- read.csv("Universities.csv", header = TRUE)
head(univ)
## College.Name State Public.vs.Private No.appli.rec
## 1 Alaska Pacific University AK 2 193
## 2 University of Alaska at Fairbanks AK 1 1852
## 3 University of Alaska Southeast AK 1 146
## 4 University of Alaska at Anchorage AK 1 2065
## 5 Alabama Agri. & Mech. Univ. AL 1 2817
## 6 Faulkner University AL 2 345
## No.appl.accepted No.new.stud.enrolled Per.new.stud.from.top.10
## 1 146 55 16
## 2 1427 928 NA
## 3 117 89 4
## 4 1598 1162 NA
## 5 1920 984 NA
## 6 320 179 NA
## Per.new.stud.from.top.25 No.FT.undergrad No.PT.undergrad in.state.tuition
## 1 44 249 869 7560
## 2 NA 3885 4519 1742
## 3 24 492 1849 1742
## 4 NA 6209 10537 1742
## 5 NA 3958 305 1700
## 6 27 1367 578 5600
## out.of.state.tuition room board add.fees estim.book.costs
## 1 7560 1620 2500 130 800
## 2 5226 1800 1790 155 650
## 3 5226 2514 2250 34 500
## 4 5226 2600 2520 114 580
## 5 3400 1108 1442 155 500
## 6 5600 1550 1700 300 350
## estim.personal.dollar Per.fac.wvs.PHD stud.vs.fac.ratio Graduation.rate
## 1 1500 76 11.9 15
## 2 2304 67 10.0 NA
## 3 1162 39 9.5 39
## 4 1260 48 13.7 NA
## 5 850 53 14.3 40
## 6 NA 52 32.8 55
# 2. Remove records with missing data -------------------------------------------------
univ_complete <- na.omit(univ)
# 3. Hierarchical clustering using normalised data, complete linkage and Manhattan --------
# 3.1 Remove categorical variables and ID----------------------------------------
# These are college name, state and public vs private
univ_cont <- univ_complete[, -c(1:3)]
head(univ_cont)
## No.appli.rec No.appl.accepted No.new.stud.enrolled Per.new.stud.from.top.10
## 1 193 146 55 16
## 3 146 117 89 4
## 10 805 588 287 67
## 12 608 520 127 26
## 22 4414 1500 335 30
## 26 1797 1260 938 24
## Per.new.stud.from.top.25 No.FT.undergrad No.PT.undergrad in.state.tuition
## 1 44 249 869 7560
## 3 24 492 1849 1742
## 10 88 1376 207 11660
## 12 47 538 126 8080
## 22 60 908 119 5666
## 26 35 6960 4698 2220
## out.of.state.tuition room board add.fees estim.book.costs
## 1 7560 1620 2500 130 800
## 3 5226 2514 2250 34 500
## 10 11660 2050 2430 120 400
## 12 8080 1380 2540 100 500
## 22 5666 1424 1540 418 1000
## 26 4440 1935 3240 291 750
## estim.personal.dollar Per.fac.wvs.PHD stud.vs.fac.ratio Graduation.rate
## 1 1500 76 11.9 15
## 3 1162 39 9.5 39
## 10 900 74 14.0 72
## 12 1100 63 11.4 44
## 22 1400 56 15.5 46
## 26 2200 96 6.7 33
names(univ_cont)
## [1] "No.appli.rec" "No.appl.accepted"
## [3] "No.new.stud.enrolled" "Per.new.stud.from.top.10"
## [5] "Per.new.stud.from.top.25" "No.FT.undergrad"
## [7] "No.PT.undergrad" "in.state.tuition"
## [9] "out.of.state.tuition" "room"
## [11] "board" "add.fees"
## [13] "estim.book.costs" "estim.personal.dollar"
## [15] "Per.fac.wvs.PHD" "stud.vs.fac.ratio"
## [17] "Graduation.rate"
# 3.2 Normalise data ------------------------------------------------------
univ_cont_norm <- sapply(univ_cont, scale)
head(univ_cont_norm)
## No.appli.rec No.appl.accepted No.new.stud.enrolled
## [1,] -0.7253139 -0.7656329 -0.7925715
## [2,] -0.7368529 -0.7772155 -0.7554388
## [3,] -0.5750612 -0.5890979 -0.5391950
## [4,] -0.6234268 -0.6162571 -0.7139374
## [5,] 0.3109878 -0.2248447 -0.4867723
## [6,] -0.3315143 -0.3207008 0.1717883
## Per.new.stud.from.top.10 Per.new.stud.from.top.25 No.FT.undergrad
## [1,] -0.6500683 -0.5732933 -0.7097404
## [2,] -1.2994472 -1.5573355 -0.6576975
## [3,] 2.1097921 1.5915994 -0.4683728
## [4,] -0.1089192 -0.4256870 -0.6478457
## [5,] 0.1075405 0.2139404 -0.5686035
## [6,] -0.2171490 -1.0161123 0.7275427
## No.PT.undergrad in.state.tuition out.of.state.tuition room
## [1,] 0.0462840 -0.3347297 -0.6993021 -0.8428467
## [2,] 0.6802614 -1.3893276 -1.2406234 0.4106795
## [3,] -0.3819742 0.4084555 0.2516051 -0.2399203
## [4,] -0.4343744 -0.2404720 -0.5786992 -1.1793639
## [5,] -0.4389028 -0.6780450 -1.1385749 -1.1176691
## [6,] 2.5233243 -1.3026831 -1.4229193 -0.4011680
## board add.fees estim.book.costs estim.personal.dollar
## [1,] 0.6669350 -0.6997824 1.5394532 0.2758088
## [2,] 0.2259098 -0.9695550 -0.2989446 -0.2199034
## [3,] 0.5434479 -0.7278837 -0.9117438 -0.6041537
## [4,] 0.7374990 -0.7840863 -0.2989446 -0.3108329
## [5,] -1.0266018 0.1095355 2.7650518 0.1291484
## [6,] 1.9723696 -0.2473512 1.2330536 1.3024317
## Per.fac.wvs.PHD stud.vs.fac.ratio Graduation.rate
## [1,] 0.16752615 -0.529035524 -2.7862940
## [2,] -2.05260943 -1.144600894 -1.4637549
## [3,] 0.04751883 0.009584174 0.3547362
## [4,] -0.61252148 -0.657278310 -1.1882260
## [5,] -1.03254714 0.394312530 -1.0780144
## [6,] 1.36759945 -1.862760492 -1.7943897
# 3.3 Compute normalised distance using manhattan distance ----------------
# create the distance matrix using the following syntax
distance_matrix <- dist(univ_cont_norm, method = "manhattan")
#as.matrix(distance_matrix)
# 3.4 Create hierarchical clusters -----------------------------------------
# Use the following syntax
hierarchical_cluster <- hclust(distance_matrix, method = "complete")
# plot the dendrogram using the following syntax
plot(hierarchical_cluster, hang = "-200", ann = TRUE)
# check cluster quality
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(univ_cont_norm,
hcut, method = "silhouette") +
labs(subtitle = "Silhouette method")
# 3.5 Compute cluster membership using k = ??? ------------------------------
# create a vector of memberships using the following syntax
univ_hc_m_memb <- cutree(hierarchical_cluster, k = 3)
head(univ_hc_m_memb)
## [1] 1 1 1 1 1 1
# 3.6 Create data frame from cluster membership ---------------------------
# use as.data.frame()
# change the column name if preferable
cluster_df <- as.data.frame(univ_hc_m_memb)
names(cluster_df)[1] <- "Cluster"
names(cluster_df)
## [1] "Cluster"
# 3.7 Merge with original data --------------------------------------------
# Merge with original data
# use cbind()
univ_complete_w_clusters <- cbind(univ_complete, cluster_df)
# if desired, this can be done in a different application.
# just export the merged data to csv
# write.csv(univ_complete_w_cluster, "univ_complete_w_cluster.csv")
head(univ_complete_w_clusters, 50)
## College.Name State Public.vs.Private No.appli.rec
## 1 Alaska Pacific University AK 2 193
## 3 University of Alaska Southeast AK 1 146
## 10 Birmingham-Southern College AL 2 805
## 12 Huntingdon College AL 2 608
## 22 Talladega College AL 2 4414
## 26 University of Alabama at Birmingham AL 1 1797
## 32 Arkansas College (Lyon College) AR 2 708
## 38 Hendrix College AR 2 823
## 39 John Brown University AR 2 605
## 46 Harding University AR 2 1721
## 49 Northern Arizona University AZ 1 5891
## 50 University of Arizona AZ 1 14079
## 63 California Polytechnic-San Luis CA 1 7811
## 77 Claremont McKenna College CA 2 1860
## 78 Harvey Mudd College CA 2 1377
## 79 Pitzer College CA 2 1133
## 81 Scripps College CA 2 855
## 90 Occidental College CA 2 2324
## 92 Fresno Pacific College CA 2 346
## 95 Pepperdine University CA 2 3821
## 96 Southern California College CA 2 385
## 97 St. Mary's College of California CA 2 2643
## 108 University of San Francisco CA 2 2306
## 110 University of Southern California CA 2 12229
## 112 Westmont College CA 1 950
## 120 Loyola Marymount University CA 2 3768
## 121 Concordia University CA 2 688
## 122 Adams State College CO 1 1508
## 123 Colorado College CO 2 3207
## 126 Fort Lewis College CO 1 3440
## 127 Mesa State College CO 1 1584
## 130 University of Southern Colorado CO 1 1401
## 134 Western State College of Colorado CO 1 2702
## 139 Central Connecticut State University CT 1 4158
## 140 Connecticut College CT 2 3035
## 146 Sacred Heart University CT 2 2307
## 148 Saint Joseph College CT 2 292
## 149 Trinity College CT 2 3058
## 151 University of Bridgeport CT 2 598
## 152 University of Hartford CT 2 5081
## 153 Wesleyan University CT 2 4772
## 154 Eastern Connecticut State University CT 1 2172
## 156 University of Connecticut at Storrs CT 1 9735
## 158 Catholic University of America DC 2 1754
## 160 George Washington University DC 2 7875
## 161 Georgetown University DC 2 11115
## 164 Trinity College DC 2 247
## 168 University of Delaware DE 2 14446
## 169 Wesley College DE 2 980
## 172 Bethune Cookman College FL 2 1646
## No.appl.accepted No.new.stud.enrolled Per.new.stud.from.top.10
## 1 146 55 16
## 3 117 89 4
## 10 588 287 67
## 12 520 127 26
## 22 1500 335 30
## 26 1260 938 24
## 32 334 166 46
## 38 721 274 52
## 39 405 284 24
## 46 1068 806 35
## 49 4931 1973 23
## 50 12238 4529 29
## 63 3817 1650 47
## 77 767 227 71
## 78 572 178 95
## 79 630 220 37
## 81 632 139 60
## 90 1319 370 52
## 92 274 146 51
## 95 2037 680 86
## 96 340 193 18
## 97 1611 465 36
## 108 1721 538 23
## 110 8498 2477 45
## 112 713 351 42
## 120 2662 753 42
## 121 497 144 30
## 122 1259 569 16
## 123 1577 490 56
## 126 2823 1123 16
## 127 1456 891 6
## 130 1239 605 10
## 134 1623 604 7
## 139 2532 902 6
## 140 1546 438 42
## 146 1896 509 19
## 148 241 96 20
## 149 1798 478 46
## 151 441 153 16
## 152 4040 1194 11
## 153 1973 712 60
## 154 1493 564 14
## 156 7187 2064 23
## 158 1465 505 24
## 160 5062 1492 38
## 161 2881 1390 71
## 164 189 100 19
## 168 10516 3252 22
## 169 807 350 10
## 172 1150 542 12
## Per.new.stud.from.top.25 No.FT.undergrad No.PT.undergrad in.state.tuition
## 1 44 249 869 7560
## 3 24 492 1849 1742
## 10 88 1376 207 11660
## 12 47 538 126 8080
## 22 60 908 119 5666
## 26 35 6960 4698 2220
## 32 74 530 182 8644
## 38 87 954 6 8800
## 39 53 961 99 6398
## 46 75 3128 213 5504
## 49 48 11249 2682 1828
## 50 57 21664 4894 1828
## 63 73 12911 1404 2033
## 77 93 887 1 17000
## 78 100 654 5 17230
## 79 73 750 30 17688
## 81 83 569 7 17238
## 90 81 1686 35 16560
## 92 87 704 63 9900
## 95 96 2488 625 18200
## 96 38 784 127 9520
## 97 80 2615 248 13332
## 108 48 4309 549 13226
## 110 71 13259 1429 17230
## 112 72 1276 9 14320
## 120 64 3558 436 13592
## 121 75 641 101 10800
## 122 44 1922 232 1386
## 123 87 1892 7 17142
## 126 35 3793 486 1450
## 127 18 3471 911 1434
## 130 34 3716 675 1644
## 134 24 2315 146 1819
## 139 24 6394 3881 1842
## 140 93 1630 232 18740
## 146 51 1707 1889 11070
## 148 52 543 712 12200
## 149 84 1737 244 18810
## 151 36 527 312 12500
## 152 26 3768 1415 14220
## 153 86 2714 27 19130
## 154 50 2766 1531 1842
## 156 63 12478 1660 3824
## 158 49 2159 211 13712
## 160 71 5471 1470 17450
## 161 93 5881 406 18300
## 164 49 309 639 11412
## 168 57 14130 4522 3690
## 169 25 872 448 9890
## 172 30 2128 82 5188
## out.of.state.tuition room board add.fees estim.book.costs
## 1 7560 1620 2500 130 800
## 3 5226 2514 2250 34 500
## 10 11660 2050 2430 120 400
## 12 8080 1380 2540 100 500
## 22 5666 1424 1540 418 1000
## 26 4440 1935 3240 291 750
## 32 8644 2382 1540 120 500
## 38 8800 1935 1260 325 500
## 39 6398 1450 2222 148 400
## 46 5504 1650 1878 1016 700
## 49 6746 1890 1838 66 620
## 50 7434 2186 2030 66 620
## 63 7380 2335 2542 20 612
## 77 17000 3160 2850 140 500
## 78 17230 3320 3370 465 700
## 79 17688 3192 2708 1548 650
## 81 17238 3500 3850 112 600
## 90 16560 2980 2160 334 558
## 92 9900 1500 2170 174 630
## 95 18200 3750 3020 70 500
## 96 9520 1980 2144 340 630
## 97 13332 3177 3177 120 630
## 108 13226 3846 2606 100 750
## 110 17230 3566 2916 330 600
## 112 14320 3080 2224 866 490
## 120 13592 3276 2640 176 545
## 121 10800 2520 1920 690 570
## 122 1672 1820 1910 374 520
## 123 17142 2114 2076 120 450
## 126 6198 1820 1500 327 500
## 127 5016 1782 2016 380 540
## 130 7100 1756 2624 324 540
## 134 5918 1936 1819 507 500
## 139 5962 2384 2060 1298 500
## 140 18740 2935 3365 210 600
## 146 11070 4090 1690 320 400
## 148 12200 2200 2400 200 650
## 149 18810 3540 2150 780 500
## 151 12500 3700 3110 624 500
## 152 14220 2300 3700 690 500
## 153 19130 3290 2310 660 1400
## 154 5962 2436 1880 1513 650
## 156 11656 2552 2520 888 700
## 158 13712 3622 2786 460 526
## 160 17450 3998 2330 720 700
## 161 18300 2590 4541 156 670
## 164 11412 2900 3530 150 500
## 168 10220 2280 1950 410 530
## 169 9890 2450 2224 405 500
## 172 5188 1541 1855 427 650
## estim.personal.dollar Per.fac.wvs.PHD stud.vs.fac.ratio Graduation.rate
## 1 1500 76 11.9 15
## 3 1162 39 9.5 39
## 10 900 74 14.0 72
## 12 1100 63 11.4 44
## 22 1400 56 15.5 46
## 26 2200 96 6.7 33
## 32 800 79 12.6 54
## 38 1200 82 13.1 63
## 39 1350 68 13.3 75
## 46 910 71 17.7 73
## 49 2342 78 21.7 41
## 50 2550 91 19.6 49
## 63 2091 72 19.8 59
## 77 850 99 9.6 87
## 78 900 100 8.2 100
## 79 850 100 10.4 73
## 81 800 95 8.2 73
## 90 1152 91 10.5 79
## 92 1818 59 10.5 54
## 95 700 95 11.6 66
## 96 1818 63 18.6 43
## 97 1584 88 16.1 78
## 108 2450 86 13.6 62
## 110 2210 90 11.4 68
## 112 1410 77 14.9 87
## 120 1328 84 14.2 84
## 121 1515 55 13.1 55
## 122 2200 63 27.9 60
## 123 1200 85 11.3 84
## 126 2500 89 19.1 46
## 127 2256 48 28.8 59
## 130 2948 63 19.4 36
## 134 2050 76 19.4 52
## 139 985 69 16.7 49
## 140 500 86 10.7 91
## 146 600 71 14.8 82
## 148 950 87 11.2 76
## 149 680 91 10.4 91
## 151 500 76 7.9 51
## 152 1440 61 10.7 66
## 153 1400 90 12.1 92
## 154 500 71 16.9 50
## 156 2300 89 16.0 71
## 158 1100 90 9.3 75
## 160 950 92 7.6 72
## 161 1700 91 7.2 95
## 164 900 89 8.3 96
## 168 1300 82 18.3 75
## 169 1350 52 14.4 84
## 172 2500 48 13.8 58
## Cluster
## 1 1
## 3 1
## 10 1
## 12 1
## 22 1
## 26 1
## 32 1
## 38 1
## 39 1
## 46 1
## 49 1
## 50 2
## 63 1
## 77 3
## 78 3
## 79 3
## 81 3
## 90 3
## 92 1
## 95 3
## 96 1
## 97 3
## 108 1
## 110 3
## 112 1
## 120 3
## 121 1
## 122 1
## 123 3
## 126 1
## 127 1
## 130 1
## 134 1
## 139 1
## 140 3
## 146 1
## 148 1
## 149 3
## 151 1
## 152 1
## 153 3
## 154 1
## 156 2
## 158 1
## 160 3
## 161 3
## 164 1
## 168 2
## 169 1
## 172 1
nrow(univ_cont)
## [1] 471
nrow(univ_complete)
## [1] 471
length(univ_complete_w_clusters$Cluster)
## [1] 471
length(univ_hc_m_memb)
## [1] 471
names(univ_complete_w_clusters)
## [1] "College.Name" "State"
## [3] "Public.vs.Private" "No.appli.rec"
## [5] "No.appl.accepted" "No.new.stud.enrolled"
## [7] "Per.new.stud.from.top.10" "Per.new.stud.from.top.25"
## [9] "No.FT.undergrad" "No.PT.undergrad"
## [11] "in.state.tuition" "out.of.state.tuition"
## [13] "room" "board"
## [15] "add.fees" "estim.book.costs"
## [17] "estim.personal.dollar" "Per.fac.wvs.PHD"
## [19] "stud.vs.fac.ratio" "Graduation.rate"
## [21] "Cluster"
# 4. Investigate cluster characteristics ----------------------------------
# Cluster characteristics, numerical variables
# may use aggregate()
aggregate(univ_cont,
by = list(Cluster = univ_complete_w_clusters$Cluster),
FUN = mean)
## Cluster No.appli.rec No.appl.accepted No.new.stud.enrolled
## 1 1 2027.571 1440.339 585.6202
## 2 2 14043.185 9537.704 3501.4815
## 3 3 4629.782 2397.051 754.2949
## Per.new.stud.from.top.10 Per.new.stud.from.top.25 No.FT.undergrad
## 1 21.82514 49.17213 2678.484
## 2 32.55556 64.70370 17186.370
## 3 55.47436 82.92308 2997.269
## No.PT.undergrad in.state.tuition out.of.state.tuition room board
## 1 706.3689 8076.637 9190.503 2062.150 2002.738
## 2 3678.0741 3871.074 9156.296 2244.407 2128.407
## 3 227.7179 17563.551 17563.551 2958.923 2679.038
## add.fees estim.book.costs estim.personal.dollar Per.fac.wvs.PHD
## 1 360.2951 536.8169 1345.164 68.63115
## 2 684.7407 565.7778 1755.630 86.44444
## 3 361.0641 599.0513 1002.462 90.10256
## stud.vs.fac.ratio Graduation.rate
## 1 14.60956 61.43443
## 2 16.37037 67.44444
## 3 10.09359 84.28205
# Cluster characteristics, categorical variables
# may use table()
table(univ_complete_w_clusters$Cluster, univ_complete_w_clusters$State)
##
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO
## 1 2 4 4 1 6 5 6 2 1 6 5 1 17 2 10 14 7 6 4 9 2 4 11 8 14
## 2 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 3 1 0 2 1 0
## 3 0 0 0 0 9 1 3 2 0 2 1 0 1 0 4 1 0 0 1 10 0 2 0 2 1
##
## MS MT NC ND NE NH NJ NM NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
## 1 5 2 19 5 7 4 10 2 25 17 6 4 31 2 9 4 12 17 2 13 7 1 8 2 1
## 2 0 0 2 0 0 1 1 0 1 4 0 0 2 1 0 0 0 2 0 1 0 0 0 0 0
## 3 0 0 2 0 0 1 2 0 12 3 0 1 9 1 0 0 3 1 0 1 0 1 1 0 0
table(univ_complete_w_clusters$Cluster, univ_complete_w_clusters$Public.vs.Private)
##
## 1 2
## 1 104 262
## 2 24 3
## 3 0 78
This clustering method parts our data into three different types of universities, each with their own distinct characteristics.
Our first cluster includes smaller schools, mostly private with a mix of some public. The application counts and enrolling counts are lowest in this cluster, but the selection process isn’t as rigorous as cluster 3.
The second cluster includes mostly public schools, where the average number of applicants is 14,043. Tuition is lowest in this cluster, which aligns with the affordability of in-state public colleges. This cluster is still moderately selective, but still doesn’t have the selection requirements of cluster 3.
Cluster three includes exclusively private schools. Applicants are twice as common for this cluster compared to cluster 1, but the enrolled student count is only less than 200 above cluster 1. This rate, plus the 83% of students enrolled in the top 25% nationally, signify that these schools have the most competent and prepared students of all three clusters. Furthermore, graduation rate is 84% for this group, by far the highest of the three clusters.
These clusters make sense to me, as two clusters deal with high-end private schools and public schools respectively, with the third cluster encompassing the remaining lower-prestige private schools and smaller public schools.