In this project, I time Times Higher Education’s World University Rankings 2020 from kaggle (https://www.kaggle.com/joeshamen/world-university-rankings-2020) to implement dimension reduction. The project includes 5 parts: - Data processing - Analysis of correlation - Optimal number of clusters - Data clustering - Assessing clustering quality - Analysis of clusters.
uniRank <- read.csv('Word_University_Rank_2020.csv',stringsAsFactors = F)
uniRank$Overall_Ranking <- NULL
uniRank1 <- uniRank[,]
uniRank1$Numb_students_per_Staff <- as.numeric(uniRank1$Numb_students_per_Staff)
uniRank1$Number_students <- as.integer(gsub(',','',uniRank1$Number_students))
uniRank1$International_Students<- as.numeric(gsub('%','',uniRank1$International_Students))/100
uniRank1$International_Students <- ifelse(is.na(uniRank1$International_Students), 0, uniRank1$International_Students)
uniRank1$Percentage_Female<- as.numeric(gsub('%','',uniRank1$Percentage_Female))/100
uniRank1$Percentage_Male<- as.numeric(gsub('%','',uniRank1$Percentage_Male))/100
uniRank1$Rank_Char <- NULL
uniName <- uniRank1[,2]
uniCountry <- uniRank1[,3]
uniRankScore <- uniRank1[,1]
uniData <- uniRank1[-c(1:3,ncol(uniRank1))]
summary(uniRank1)
## Score_Rank University Country Number_students
## Min. : 1.0 Length:1396 Length:1396 Min. : 558
## 1st Qu.:212.0 Class :character Class :character 1st Qu.: 10262
## Median :336.0 Mode :character Mode :character Median : 17832
## Mean :315.3 Mean : 23723
## 3rd Qu.:437.0 3rd Qu.: 29434
## Max. :535.0 Max. :830104
## Numb_students_per_Staff International_Students Percentage_Female
## Min. : 0.90 Min. :0.0000 Min. :0.0000
## 1st Qu.: 12.38 1st Qu.:0.0200 1st Qu.:0.4100
## Median : 16.35 Median :0.0800 Median :0.5200
## Mean : 18.97 Mean :0.1135 Mean :0.4753
## 3rd Qu.: 21.90 3rd Qu.:0.1700 3rd Qu.:0.5700
## Max. :493.50 Max. :0.8300 Max. :1.0000
## Percentage_Male Teaching Research Citations
## Min. :0.0000 Min. :11.20 Min. : 6.80 Min. : 1.70
## 1st Qu.:0.4100 1st Qu.:18.30 1st Qu.:11.60 1st Qu.: 23.38
## Median :0.4700 Median :23.80 Median :18.00 Median : 45.65
## Mean :0.4774 Mean :28.23 Mean :23.98 Mean : 48.11
## 3rd Qu.:0.5600 3rd Qu.:33.60 3rd Qu.:30.10 3rd Qu.: 71.95
## Max. :1.0000 Max. :92.80 Max. :99.60 Max. :100.00
## Industry_Income International_Outlook Score_Result
## Min. : 34.40 Min. :13.10 Min. :10.70
## 1st Qu.: 35.77 1st Qu.:27.48 1st Qu.:21.00
## Median : 39.40 Median :43.10 Median :31.60
## Mean : 46.48 Mean :47.11 Mean :34.79
## 3rd Qu.: 49.83 3rd Qu.:62.80 3rd Qu.:44.50
## Max. :100.00 Max. :99.70 Max. :95.40
summary(uniRank)
## Rank_Char Score_Rank University Country
## Length:1396 Min. : 1.0 Length:1396 Length:1396
## Class :character 1st Qu.:212.0 Class :character Class :character
## Mode :character Median :336.0 Mode :character Mode :character
## Mean :315.3
## 3rd Qu.:437.0
## Max. :535.0
## Number_students Numb_students_per_Staff International_Students
## Length:1396 Min. : 0.90 Length:1396
## Class :character 1st Qu.: 12.38 Class :character
## Mode :character Median : 16.35 Mode :character
## Mean : 18.97
## 3rd Qu.: 21.90
## Max. :493.50
## Percentage_Female Percentage_Male Teaching Research
## Length:1396 Length:1396 Min. :11.20 Min. : 6.80
## Class :character Class :character 1st Qu.:18.30 1st Qu.:11.60
## Mode :character Mode :character Median :23.80 Median :18.00
## Mean :28.23 Mean :23.98
## 3rd Qu.:33.60 3rd Qu.:30.10
## Max. :92.80 Max. :99.60
## Citations Industry_Income International_Outlook Score_Result
## Min. : 1.70 Min. : 34.40 Min. :13.10 Min. :10.70
## 1st Qu.: 23.38 1st Qu.: 35.77 1st Qu.:27.48 1st Qu.:21.00
## Median : 45.65 Median : 39.40 Median :43.10 Median :31.60
## Mean : 48.11 Mean : 46.48 Mean :47.11 Mean :34.79
## 3rd Qu.: 71.95 3rd Qu.: 49.83 3rd Qu.:62.80 3rd Qu.:44.50
## Max. :100.00 Max. :100.00 Max. :99.70 Max. :95.40
uniRank$Overall_Ranking <- NULL
uniRank1 <- uniRank[,]
uniRank1$Numb_students_per_Staff <- as.numeric(uniRank1$Numb_students_per_Staff)
uniRank1$Number_students <- as.integer(gsub(',','',uniRank1$Number_students))
uniRank1$International_Students<- as.numeric(gsub('%','',uniRank1$International_Students))/100
uniRank1$International_Students <- ifelse(is.na(uniRank1$International_Students), 0, uniRank1$International_Students)
uniRank1$Percentage_Female<- as.numeric(gsub('%','',uniRank1$Percentage_Female))/100
uniRank1$Percentage_Male<- as.numeric(gsub('%','',uniRank1$Percentage_Male))/100
uniRank1$Rank_Char <- NULL
uniName <- uniRank1[,2]
uniCountry <- uniRank1[,3]
uniRankScore <- uniRank1[,1]
uniData.n<-as.matrix(as.data.frame(lapply(uniData, scale)))
uniData.n.cor<-cor(uniData.n, method="pearson")
corrplot(uniData.n.cor, order ="alphabet", tl.cex=0.6, na.label = "NA")
According to the correlation plot, we can see some high positive correlation between dimensions such as research and citation, teaching and research. Therefore, we can use dimension reduction with the dataset.
distUni <- dist(uniData)
mds1<-cmdscale(distUni, k=2)
summary(mds1)
## V1 V2
## Min. :-23165 Min. :-110.162
## 1st Qu.:-13461 1st Qu.: -24.975
## Median : -5890 Median : 5.128
## Mean : 0 Mean : 0.000
## 3rd Qu.: 5711 3rd Qu.: 31.194
## Max. :806381 Max. : 58.322
plot(mds1)
text(mds1, labels=uniName, cex=0.6, adj=0.5)
x.out<-which(mds1[,1]>4e+05)
y.out<-which(mds1[,2]>100 | mds1[,2]< -150)
out.all<-unique(c(y.out, x.out)) # merging into single dataset
uniRank1[out.all,]
## Score_Rank University Country Number_students
## 1085 447 Anadolu University Turkey 830104
## Numb_students_per_Staff International_Students Percentage_Female
## 1085 493.5 0.01 0.37
## Percentage_Male Teaching Research Citations Industry_Income
## 1085 0.63 13 24.1 16.5 100
## International_Outlook Score_Result
## 1085 17.3 19.9
Via MDS visualization, we can see outliers in 2-d graph and drop them in further analysis.
uniData.cov<-cov(uniData.n)
uniData.eigen <-eigen(uniData.cov)
uniData.eigen$values
## [1] 3.48590400 1.66430437 1.54855008 0.91350126 0.79788309 0.58987928
## [7] 0.46198439 0.32470142 0.14504825 0.06824386
head(uniData.eigen$vectors)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.03478252 -0.697090240 0.01839800 0.02901168 -0.02731372 0.18815822
## [2,] 0.01836167 -0.698484595 0.04731838 -0.12498577 0.10052664 -0.04770570
## [3,] -0.40226337 0.063142795 0.24866956 -0.29945600 0.43539520 -0.08822456
## [4,] 0.01738866 -0.036955158 0.57115510 -0.17569414 -0.64905368 -0.46007800
## [5,] 0.06264615 0.046667230 -0.38169509 -0.87573491 -0.22116204 0.16639321
## [6,] -0.43993135 0.008192937 -0.22678733 0.16786569 -0.35122752 0.15593505
## [,7] [,8] [,9] [,10]
## [1,] 0.28049035 0.62666217 -0.040455489 0.050086839
## [2,] -0.20941373 -0.65362083 -0.009223651 -0.101886407
## [3,] 0.29001967 -0.03626203 -0.616531081 0.140702706
## [4,] 0.05651668 0.02813176 -0.062162794 0.021287920
## [5,] -0.01146443 0.04888161 0.046590920 -0.001986479
## [6,] 0.39046120 -0.16135136 -0.104768876 -0.624207227
uniPCA1 <- prcomp(uniData.n, center=FALSE, scale.=FALSE)
summary(uniPCA1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8671 1.2901 1.2444 0.95577 0.89324 0.76804 0.6797
## Proportion of Variance 0.3486 0.1664 0.1549 0.09135 0.07979 0.05899 0.0462
## Cumulative Proportion 0.3486 0.5150 0.6699 0.76123 0.84101 0.90000 0.9462
## PC8 PC9 PC10
## Standard deviation 0.56983 0.3809 0.26124
## Proportion of Variance 0.03247 0.0145 0.00682
## Cumulative Proportion 0.97867 0.9932 1.00000
In the loading table, 5 principal components can account for 84% of total variance. However, the first 2 PCs only take 52% of variance.
### Plot cumulative variance
plot(summary(uniPCA1)$importance[3,],type = 'l')
var<-get_pca_var(uniPCA1)
a<-fviz_contrib(uniPCA1, "var", axes=1, xtickslab.rt=90)
b<-fviz_contrib(uniPCA1, "var", axes=2, xtickslab.rt=90)
grid.arrange(a,b,ncol = 2, top='Contribution to the first two Principal Components')
We can see in the first 2 PCs, Reasearch, Teaching, International Outlook, citations, number of students per staff and number of student contribute mostly to these PCs. These variables can explain the difference between universities in the ranking list.
uniPCA2<-principal(uniData.n, nfactors=4, rotate="varimax")
summary(uniPCA2)
##
## Factor analysis with Call: principal(r = uniData.n, nfactors = 4, rotate = "varimax")
##
## Test of the hypothesis that 4 factors are sufficient.
## The degrees of freedom for the model is 11 and the objective function was 1.74
## The number of observations was 1396 with Chi Square = 2414.38 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.08
print(loadings(uniPCA2), digits=4, cutoff=0.4, sort=TRUE)
##
## Loadings:
## RC1 RC3 RC2 RC4
## International_Students 0.8576
## Research 0.6983 0.6148
## Citations 0.7412
## International_Outlook 0.9085
## Percentage_Female -0.6655
## Teaching 0.5948 0.6375
## Industry_Income 0.7559
## Number_students 0.8967
## Numb_students_per_Staff 0.9097
## Percentage_Male 0.9604
##
## RC1 RC3 RC2 RC4
## SS loadings 3.0747 1.8548 1.6620 1.0208
## Proportion Var 0.3075 0.1855 0.1662 0.1021
## Cumulative Var 0.3075 0.4929 0.6591 0.7612
scree1 <- fviz_eig(uniPCA1, choice='eigenvalue',) # eigenvalues on y-axis
scree2 <- fviz_eig(uniPCA1)
grid.arrange(scree1,scree2,ncol = 2 ,top='Contribution to the first two Principal Components')
On the Scree Plot, the 5th PC accounts for nearly 10% of explained variances. This means the datasets have more than 5 PCs to nearly describe its variance clearly.
uniPCA2$complexity
## Number_students Numb_students_per_Staff International_Students
## 1.025463 1.008019 1.035182
## Percentage_Female Percentage_Male Teaching
## 1.426568 1.045838 2.094714
## Research Citations Industry_Income
## 2.025968 1.237223 1.222121
## International_Outlook
## 1.020325
uniPCA2$uniquenesses
## Number_students Numb_students_per_Staff International_Students
## 0.18574622 0.16910524 0.25161578
## Percentage_Female Percentage_Male Teaching
## 0.46330957 0.05651008 0.21984060
## Research Citations Industry_Income
## 0.12164650 0.38693542 0.36672955
## International_Outlook
## 0.16630132
autoplot(uniPCA1, loadings=TRUE, loadings.colour='blue', loadings.label=TRUE,loadings.label.size=3)
uniKM <- eclust(uniData.n, k=3, graph = F)
autoplot(uniPCA1, data=uniKM, colour="cluster")