1. Project introduction

In this project, I time Times Higher Education’s World University Rankings 2020 from kaggle (https://www.kaggle.com/joeshamen/world-university-rankings-2020) to implement dimension reduction. The project includes 5 parts: - Data processing - Analysis of correlation - Optimal number of clusters - Data clustering - Assessing clustering quality - Analysis of clusters.

2. Process the dataset

uniRank <- read.csv('Word_University_Rank_2020.csv',stringsAsFactors = F)
uniRank$Overall_Ranking <- NULL
uniRank1 <- uniRank[,]
uniRank1$Numb_students_per_Staff <- as.numeric(uniRank1$Numb_students_per_Staff)
uniRank1$Number_students <- as.integer(gsub(',','',uniRank1$Number_students))
uniRank1$International_Students<- as.numeric(gsub('%','',uniRank1$International_Students))/100
uniRank1$International_Students <- ifelse(is.na(uniRank1$International_Students), 0, uniRank1$International_Students)
uniRank1$Percentage_Female<- as.numeric(gsub('%','',uniRank1$Percentage_Female))/100
uniRank1$Percentage_Male<- as.numeric(gsub('%','',uniRank1$Percentage_Male))/100
uniRank1$Rank_Char <- NULL
uniName <- uniRank1[,2]
uniCountry <- uniRank1[,3]
uniRankScore <- uniRank1[,1]
uniData <- uniRank1[-c(1:3,ncol(uniRank1))]
summary(uniRank1)
##    Score_Rank     University          Country          Number_students 
##  Min.   :  1.0   Length:1396        Length:1396        Min.   :   558  
##  1st Qu.:212.0   Class :character   Class :character   1st Qu.: 10262  
##  Median :336.0   Mode  :character   Mode  :character   Median : 17832  
##  Mean   :315.3                                         Mean   : 23723  
##  3rd Qu.:437.0                                         3rd Qu.: 29434  
##  Max.   :535.0                                         Max.   :830104  
##  Numb_students_per_Staff International_Students Percentage_Female
##  Min.   :  0.90          Min.   :0.0000         Min.   :0.0000   
##  1st Qu.: 12.38          1st Qu.:0.0200         1st Qu.:0.4100   
##  Median : 16.35          Median :0.0800         Median :0.5200   
##  Mean   : 18.97          Mean   :0.1135         Mean   :0.4753   
##  3rd Qu.: 21.90          3rd Qu.:0.1700         3rd Qu.:0.5700   
##  Max.   :493.50          Max.   :0.8300         Max.   :1.0000   
##  Percentage_Male     Teaching        Research       Citations     
##  Min.   :0.0000   Min.   :11.20   Min.   : 6.80   Min.   :  1.70  
##  1st Qu.:0.4100   1st Qu.:18.30   1st Qu.:11.60   1st Qu.: 23.38  
##  Median :0.4700   Median :23.80   Median :18.00   Median : 45.65  
##  Mean   :0.4774   Mean   :28.23   Mean   :23.98   Mean   : 48.11  
##  3rd Qu.:0.5600   3rd Qu.:33.60   3rd Qu.:30.10   3rd Qu.: 71.95  
##  Max.   :1.0000   Max.   :92.80   Max.   :99.60   Max.   :100.00  
##  Industry_Income  International_Outlook  Score_Result  
##  Min.   : 34.40   Min.   :13.10         Min.   :10.70  
##  1st Qu.: 35.77   1st Qu.:27.48         1st Qu.:21.00  
##  Median : 39.40   Median :43.10         Median :31.60  
##  Mean   : 46.48   Mean   :47.11         Mean   :34.79  
##  3rd Qu.: 49.83   3rd Qu.:62.80         3rd Qu.:44.50  
##  Max.   :100.00   Max.   :99.70         Max.   :95.40
summary(uniRank)
##   Rank_Char           Score_Rank     University          Country         
##  Length:1396        Min.   :  1.0   Length:1396        Length:1396       
##  Class :character   1st Qu.:212.0   Class :character   Class :character  
##  Mode  :character   Median :336.0   Mode  :character   Mode  :character  
##                     Mean   :315.3                                        
##                     3rd Qu.:437.0                                        
##                     Max.   :535.0                                        
##  Number_students    Numb_students_per_Staff International_Students
##  Length:1396        Min.   :  0.90          Length:1396           
##  Class :character   1st Qu.: 12.38          Class :character      
##  Mode  :character   Median : 16.35          Mode  :character      
##                     Mean   : 18.97                                
##                     3rd Qu.: 21.90                                
##                     Max.   :493.50                                
##  Percentage_Female  Percentage_Male       Teaching        Research    
##  Length:1396        Length:1396        Min.   :11.20   Min.   : 6.80  
##  Class :character   Class :character   1st Qu.:18.30   1st Qu.:11.60  
##  Mode  :character   Mode  :character   Median :23.80   Median :18.00  
##                                        Mean   :28.23   Mean   :23.98  
##                                        3rd Qu.:33.60   3rd Qu.:30.10  
##                                        Max.   :92.80   Max.   :99.60  
##    Citations      Industry_Income  International_Outlook  Score_Result  
##  Min.   :  1.70   Min.   : 34.40   Min.   :13.10         Min.   :10.70  
##  1st Qu.: 23.38   1st Qu.: 35.77   1st Qu.:27.48         1st Qu.:21.00  
##  Median : 45.65   Median : 39.40   Median :43.10         Median :31.60  
##  Mean   : 48.11   Mean   : 46.48   Mean   :47.11         Mean   :34.79  
##  3rd Qu.: 71.95   3rd Qu.: 49.83   3rd Qu.:62.80         3rd Qu.:44.50  
##  Max.   :100.00   Max.   :100.00   Max.   :99.70         Max.   :95.40
uniRank$Overall_Ranking <- NULL
uniRank1 <- uniRank[,]
uniRank1$Numb_students_per_Staff <- as.numeric(uniRank1$Numb_students_per_Staff)
uniRank1$Number_students <- as.integer(gsub(',','',uniRank1$Number_students))
uniRank1$International_Students<- as.numeric(gsub('%','',uniRank1$International_Students))/100
uniRank1$International_Students <- ifelse(is.na(uniRank1$International_Students), 0, uniRank1$International_Students)
uniRank1$Percentage_Female<- as.numeric(gsub('%','',uniRank1$Percentage_Female))/100
uniRank1$Percentage_Male<- as.numeric(gsub('%','',uniRank1$Percentage_Male))/100
uniRank1$Rank_Char <- NULL
uniName <- uniRank1[,2]
uniCountry <- uniRank1[,3]
uniRankScore <- uniRank1[,1]

Standardize data

uniData.n<-as.matrix(as.data.frame(lapply(uniData, scale)))

3. Analysis of correlation

uniData.n.cor<-cor(uniData.n, method="pearson")
corrplot(uniData.n.cor, order ="alphabet", tl.cex=0.6, na.label = "NA")

According to the correlation plot, we can see some high positive correlation between dimensions such as research and citation, teaching and research. Therefore, we can use dimension reduction with the dataset.

4. Multidimensional scaling (MDS) of a data matrix

Visualize data in 2 dimensions

distUni <- dist(uniData)
mds1<-cmdscale(distUni, k=2)
summary(mds1)
##        V1               V2          
##  Min.   :-23165   Min.   :-110.162  
##  1st Qu.:-13461   1st Qu.: -24.975  
##  Median : -5890   Median :   5.128  
##  Mean   :     0   Mean   :   0.000  
##  3rd Qu.:  5711   3rd Qu.:  31.194  
##  Max.   :806381   Max.   :  58.322
plot(mds1)
text(mds1, labels=uniName, cex=0.6, adj=0.5)

Outliers

x.out<-which(mds1[,1]>4e+05) 
y.out<-which(mds1[,2]>100 | mds1[,2]< -150) 
out.all<-unique(c(y.out, x.out)) # merging into single dataset
uniRank1[out.all,]
##      Score_Rank         University Country Number_students
## 1085        447 Anadolu University  Turkey          830104
##      Numb_students_per_Staff International_Students Percentage_Female
## 1085                   493.5                   0.01              0.37
##      Percentage_Male Teaching Research Citations Industry_Income
## 1085            0.63       13     24.1      16.5             100
##      International_Outlook Score_Result
## 1085                  17.3         19.9

Via MDS visualization, we can see outliers in 2-d graph and drop them in further analysis.

5. Principal Component Analysis

Eigenvalues on the basis of covariance

uniData.cov<-cov(uniData.n)
uniData.eigen <-eigen(uniData.cov)
uniData.eigen$values
##  [1] 3.48590400 1.66430437 1.54855008 0.91350126 0.79788309 0.58987928
##  [7] 0.46198439 0.32470142 0.14504825 0.06824386
head(uniData.eigen$vectors)
##             [,1]         [,2]        [,3]        [,4]        [,5]        [,6]
## [1,]  0.03478252 -0.697090240  0.01839800  0.02901168 -0.02731372  0.18815822
## [2,]  0.01836167 -0.698484595  0.04731838 -0.12498577  0.10052664 -0.04770570
## [3,] -0.40226337  0.063142795  0.24866956 -0.29945600  0.43539520 -0.08822456
## [4,]  0.01738866 -0.036955158  0.57115510 -0.17569414 -0.64905368 -0.46007800
## [5,]  0.06264615  0.046667230 -0.38169509 -0.87573491 -0.22116204  0.16639321
## [6,] -0.43993135  0.008192937 -0.22678733  0.16786569 -0.35122752  0.15593505
##             [,7]        [,8]         [,9]        [,10]
## [1,]  0.28049035  0.62666217 -0.040455489  0.050086839
## [2,] -0.20941373 -0.65362083 -0.009223651 -0.101886407
## [3,]  0.29001967 -0.03626203 -0.616531081  0.140702706
## [4,]  0.05651668  0.02813176 -0.062162794  0.021287920
## [5,] -0.01146443  0.04888161  0.046590920 -0.001986479
## [6,]  0.39046120 -0.16135136 -0.104768876 -0.624207227

PCA loadings

uniPCA1 <- prcomp(uniData.n, center=FALSE, scale.=FALSE)
summary(uniPCA1)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     1.8671 1.2901 1.2444 0.95577 0.89324 0.76804 0.6797
## Proportion of Variance 0.3486 0.1664 0.1549 0.09135 0.07979 0.05899 0.0462
## Cumulative Proportion  0.3486 0.5150 0.6699 0.76123 0.84101 0.90000 0.9462
##                            PC8    PC9    PC10
## Standard deviation     0.56983 0.3809 0.26124
## Proportion of Variance 0.03247 0.0145 0.00682
## Cumulative Proportion  0.97867 0.9932 1.00000


In the loading table, 5 principal components can account for 84% of total variance. However, the first 2 PCs only take 52% of variance.
### Plot cumulative variance

plot(summary(uniPCA1)$importance[3,],type = 'l')

Contributions of individual variables to PC

var<-get_pca_var(uniPCA1)
a<-fviz_contrib(uniPCA1, "var", axes=1, xtickslab.rt=90) 
b<-fviz_contrib(uniPCA1, "var", axes=2, xtickslab.rt=90)
grid.arrange(a,b,ncol = 2, top='Contribution to the first two Principal Components')


We can see in the first 2 PCs, Reasearch, Teaching, International Outlook, citations, number of students per staff and number of student contribute mostly to these PCs. These variables can explain the difference between universities in the ranking list.

Rotated PCA

uniPCA2<-principal(uniData.n, nfactors=4, rotate="varimax")
summary(uniPCA2)
## 
## Factor analysis with Call: principal(r = uniData.n, nfactors = 4, rotate = "varimax")
## 
## Test of the hypothesis that 4 factors are sufficient.
## The degrees of freedom for the model is 11  and the objective function was  1.74 
## The number of observations was  1396  with Chi Square =  2414.38  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.08
print(loadings(uniPCA2), digits=4, cutoff=0.4, sort=TRUE)
## 
## Loadings:
##                         RC1     RC3     RC2     RC4    
## International_Students   0.8576                        
## Research                 0.6983  0.6148                
## Citations                0.7412                        
## International_Outlook    0.9085                        
## Percentage_Female               -0.6655                
## Teaching                 0.5948  0.6375                
## Industry_Income                  0.7559                
## Number_students                          0.8967        
## Numb_students_per_Staff                  0.9097        
## Percentage_Male                                  0.9604
## 
##                   RC1    RC3    RC2    RC4
## SS loadings    3.0747 1.8548 1.6620 1.0208
## Proportion Var 0.3075 0.1855 0.1662 0.1021
## Cumulative Var 0.3075 0.4929 0.6591 0.7612

Quality measures of PCA

Scree plots

scree1 <- fviz_eig(uniPCA1, choice='eigenvalue',) # eigenvalues on y-axis
scree2 <- fviz_eig(uniPCA1)
grid.arrange(scree1,scree2,ncol = 2 ,top='Contribution to the first two Principal Components')


On the Scree Plot, the 5th PC accounts for nearly 10% of explained variances. This means the datasets have more than 5 PCs to nearly describe its variance clearly.

Complexity and uniqueness

uniPCA2$complexity
##         Number_students Numb_students_per_Staff  International_Students 
##                1.025463                1.008019                1.035182 
##       Percentage_Female         Percentage_Male                Teaching 
##                1.426568                1.045838                2.094714 
##                Research               Citations         Industry_Income 
##                2.025968                1.237223                1.222121 
##   International_Outlook 
##                1.020325
uniPCA2$uniquenesses
##         Number_students Numb_students_per_Staff  International_Students 
##              0.18574622              0.16910524              0.25161578 
##       Percentage_Female         Percentage_Male                Teaching 
##              0.46330957              0.05651008              0.21984060 
##                Research               Citations         Industry_Income 
##              0.12164650              0.38693542              0.36672955 
##   International_Outlook 
##              0.16630132

Visualization of PCA

autoplot(uniPCA1, loadings=TRUE, loadings.colour='blue', loadings.label=TRUE,loadings.label.size=3)

uniKM <- eclust(uniData.n, k=3, graph = F)
autoplot(uniPCA1, data=uniKM, colour="cluster")