Data description

In the data of The Times Higher Education World University Rankings 2021 we have information about 1448 universities from 92 countries (top-6 are US, Japan, UK, China, India, Brazil and the number of observed universities there very from 174 to 51). From the data we now how universirsities’ scores for their teaching, researches, citations, industry income and international outlook. Also, we have some statistical information: number of students (total and per staff), share of interantional students and female share. Descriptive statistics for these parameters can be seen below.

the2021 <-read.csv("THE2021.csv")
summary(the2021)

##                                          name                location  
##  <U+200B>Shahid Chamran University of Ahvaz:   1   United States :174  
##  Aalto University                          :   1   Japan         :111  
##  Aarhus University                         :   1   United Kingdom:101  
##  Abdul Wali Khan University Mardan         :   1   China         : 70  
##  Aberystwyth University                    :   1   India         : 62  
##  Acharya Nagarjuna University              :   1   Brazil        : 51  
##  (Other)                                   :1442   (Other)       :879  
##  scores_teaching scores_research scores_citations scores_industry_income
##  Min.   :11.70   Min.   : 7.10   Min.   :  1.90   Min.   : 33.30        
##  1st Qu.:18.50   1st Qu.:11.40   1st Qu.: 23.15   1st Qu.: 34.70        
##  Median :23.40   Median :17.15   Median : 44.80   Median : 38.40        
##  Mean   :27.53   Mean   :23.13   Mean   : 47.76   Mean   : 45.52        
##  3rd Qu.:31.93   3rd Qu.:28.82   3rd Qu.: 70.90   3rd Qu.: 48.40        
##  Max.   :94.40   Max.   :99.60   Max.   :100.00   Max.   :100.00        
##                                                                         
##  scores_international_outlook stats_number_students
##  Min.   : 13.50               Min.   :   557       
##  1st Qu.: 27.50               1st Qu.:  9746       
##  Median : 42.25               Median : 17273       
##  Mean   : 46.96               Mean   : 22241       
##  3rd Qu.: 63.02               3rd Qu.: 28753       
##  Max.   :100.00               Max.   :222102       
##                                                    
##  stats_student_staff_ratio stats_pc_intl_students stats_female_share
##  Min.   :  0.90            Min.   : 0.00          Min.   : 1.00     
##  1st Qu.: 12.20            1st Qu.: 2.00          1st Qu.:43.00     
##  Median : 16.20            Median : 7.00          Median :53.00     
##  Mean   : 18.51            Mean   :11.26          Mean   :49.94     
##  3rd Qu.: 22.02            3rd Qu.:17.00          3rd Qu.:58.00     
##  Max.   :109.10            Max.   :84.00          Max.   :98.00     
##

#unique(the2021$location) #92 countries
#summary(the2021$location)

descriptive statistics for the numeric variables:

# creation of a subset with only numeric variales (university scores and its general statistic)
library(psych)
library(dplyr)
#describeBy(the2021)
the2021_num <- the2021[, 3:11]
describeBy(the2021_num)

correlations between numeric variables:

# correlation
library(polycor)
df_cor <- hetcor(the2021_num)
as.data.frame(df_cor$correlations)

library(ggcorrplot)
df_corplot <- round(df_cor$correlations, 2)
ggcorrplot(df_corplot, type = "lower")

After correlational analysis we may see that university’s teaching and research scores are highly correlated (0.9). Maybe because high quality teaching staff produces pieces of research and inspire students do the same. Next two correlate variables are percentage of international students and university’s international outlook score (0.8), which is explainable, too: the more international students are accepted in the universiry the higher its rating on the global arena. Also there is some correlation between score variables, but it is rether weak (0.5). As for the negative correlations, in general the are almost no such, but a slight (almost insignificant) one may be noticed between female share and and industry income score (-0.3).

As for the missing values the are no such (great!) I have also standardized the numeric variables.

# missing values check
mean(is.na(the2021_num)) #0 - it's great!

# Standardization of variables:
the2021[, 3:11] <- the2021[, 3:11] %>%
  scale() %>% 
  as.data.frame()
lapply(the2021[ , 3:11], var) # all must be 1 - perfect, they are, trust me!
#summary(the2021)

PCA

PCA scores:

pca_scores <- prcomp(the2021[ , 3:11])
str(pca_scores, give.attr = F)

## List of 5
##  $ sdev    : num [1:9] 1.867 1.21 1.127 0.948 0.816 ...
##  $ rotation: num [1:9, 1:9] -0.438 -0.483 -0.395 -0.278 -0.422 ...
##  $ center  : Named num [1:9] -7.78e-17 -8.45e-17 -6.91e-17 5.70e-17 7.90e-17 ...
##  $ scale   : logi FALSE
##  $ x       : num [1:1448, 1:9] -7.23 -6.65 -6.1 -7.06 -7.09 ...

Variances (Eigenvalues):

(pca_scores$sdev ^ 2/length(pca_scores)) %>% round(2)

## [1] 0.70 0.29 0.25 0.18 0.13 0.11 0.09 0.03 0.01

Proportion of explained variance:

(pca_scores$sdev ^ 2/length(pca_scores$sdev)) %>% round(2)

## [1] 0.39 0.16 0.14 0.10 0.07 0.06 0.05 0.02 0.01

Contributions of variables:

contrib <- pca_scores$rotation[ , 1] %>% 
  abs() %>% 
  sort(decreasing = T) %>% 
  names()
pca_scores$rotation[contrib, 1] # show scores of influential variables

##              scores_research              scores_teaching 
##                 -0.483384203                 -0.438110496 
## scores_international_outlook       stats_pc_intl_students 
##                 -0.421568448                 -0.402648248 
##             scores_citations       scores_industry_income 
##                 -0.395024395                 -0.277802842 
##           stats_female_share        stats_number_students 
##                 -0.036193694                  0.004062871 
##    stats_student_staff_ratio 
##                  0.002855660

Proportion of variance explained

summary(pca_scores)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.8671 1.2097 1.1265 0.94784 0.81619 0.74787
## Proportion of Variance 0.3873 0.1626 0.1410 0.09982 0.07402 0.06215
## Cumulative Proportion  0.3873 0.5499 0.6909 0.79075 0.86476 0.92691
##                            PC7     PC8     PC9
## Standard deviation     0.66099 0.38634 0.26767
## Proportion of Variance 0.04855 0.01658 0.00796
## Cumulative Proportion  0.97545 0.99204 1.00000

Eigenvalues. They should be more than 1 and, as we may see, only the first three values fit this сondition.

 pca_scores$sdev^2

## [1] 3.48591462 1.46337701 1.26901618 0.89840475 0.66615817 0.55930826
## [7] 0.43691022 0.14926140 0.07164939

library(FactoMineR)
soc.pca <- PCA(the2021[, 3:11], graph = T)

# Calculation by using eigen on the covariance matrix

summary.PCA(soc.pca)

## 
## Call:
## PCA(X = the2021[, 3:11], graph = T) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
## Variance               3.486   1.463   1.269   0.898   0.666   0.559
## % of var.             38.732  16.260  14.100   9.982   7.402   6.215
## Cumulative % of var.  38.732  54.992  69.092  79.075  86.476  92.691
##                        Dim.7   Dim.8   Dim.9
## Variance               0.437   0.149   0.072
## % of var.              4.855   1.658   0.796
## Cumulative % of var.  97.545  99.204 100.000
## 
## Individuals (the 10 first)
##                                  Dist    Dim.1    ctr   cos2    Dim.2
## 1                            |  7.693 |  7.235  1.037  0.885 | -1.366
## 2                            |  7.549 |  6.655  0.877  0.777 | -2.714
## 3                            |  7.201 |  6.102  0.738  0.718 | -1.186
## 4                            |  7.926 |  7.061  0.988  0.794 | -3.224
## 5                            |  7.715 |  7.088  0.995  0.844 | -2.658
## 6                            |  7.406 |  6.769  0.908  0.836 | -0.926
## 7                            |  6.983 |  6.032  0.721  0.746 | -1.631
## 8                            |  6.850 |  5.682  0.640  0.688 | -1.718
## 9                            |  6.781 |  5.893  0.688  0.755 | -1.709
## 10                           |  6.783 |  5.939  0.699  0.767 | -1.544
##                                 ctr   cos2    Dim.3    ctr   cos2  
## 1                             0.088  0.032 |  0.058  0.000  0.000 |
## 2                             0.348  0.129 |  0.594  0.019  0.006 |
## 3                             0.066  0.027 |  0.120  0.001  0.000 |
## 4                             0.490  0.165 | -0.066  0.000  0.000 |
## 5                             0.334  0.119 |  0.169  0.002  0.000 |
## 6                             0.040  0.016 | -0.207  0.002  0.001 |
## 7                             0.125  0.055 |  2.058  0.230  0.087 |
## 8                             0.139  0.063 | -0.066  0.000  0.000 |
## 9                             0.138  0.064 | -0.239  0.003  0.001 |
## 10                            0.113  0.052 | -0.368  0.007  0.003 |
## 
## Variables
##                                 Dim.1    ctr   cos2    Dim.2    ctr   cos2
## scores_teaching              |  0.818 19.194  0.669 | -0.275  5.160  0.076
## scores_research              |  0.903 23.366  0.815 | -0.181  2.238  0.033
## scores_citations             |  0.738 15.604  0.544 |  0.220  3.296  0.048
## scores_industry_income       |  0.519  7.717  0.269 | -0.557 21.216  0.310
## scores_international_outlook |  0.787 17.772  0.620 |  0.374  9.564  0.140
## stats_number_students        | -0.008  0.002  0.000 |  0.319  6.958  0.102
## stats_student_staff_ratio    | -0.005  0.001  0.000 |  0.349  8.318  0.122
## stats_pc_intl_students       |  0.752 16.213  0.565 |  0.231  3.640  0.053
## stats_female_share           |  0.068  0.131  0.005 |  0.761 39.610  0.580
##                                 Dim.3    ctr   cos2  
## scores_teaching              |  0.104  0.859  0.011 |
## scores_research              |  0.190  2.851  0.036 |
## scores_citations             | -0.002  0.000  0.000 |
## scores_industry_income       |  0.305  7.312  0.093 |
## scores_international_outlook | -0.205  3.319  0.042 |
## stats_number_students        |  0.736 42.715  0.542 |
## stats_student_staff_ratio    |  0.659 34.265  0.435 |
## stats_pc_intl_students       | -0.314  7.752  0.098 |
## stats_female_share           | -0.108  0.927  0.012 |

#gr <- plot(soc.pca)
#gr + theme(panel.grid.major = element_blank(),
#   plot.title=element_text(size=14, color="blue"),
#   axis.title = element_text(size=12, color="red"))

# scree plot
library("factoextra")
fviz_screeplot(soc.pca, addlabels = TRUE, ylim = c(0, 65))

On the scree plot we may see that there are 9 components, the first one explains 38.7% variance of all the variables. The second and the third components explain 16% and 14% and further components explain 10% and less. So basically 2-3 principle components are okay to explain the data we have. Looking at the eigenvalues prooves this conclusion.

The same conclusion may be reached from the table of correlations between original variables and components, which is show below. There we can see that there are bigger correlations for the first and the second principle components. The first one summarize scores for teahing, research, citations, international outlook and percentage of international students; the second - industry income score and female share. Finally, the third component collects students number and its ration to staff.

#  (correlations between original variables and components)
as.data.frame(round(pca_scores$rotation[, 1:4], 2))

Answering the question “Is it possible to produce an acceptable PCA solution on these data” I would say - YES. First of all, there are some correlations between variables, which basically “feeds” PCA (if there were no correlations between variables there would be no reason to summarize data in order to narrow it). Then, the reasulted principle components explain a good amount of data variance - the first two components explain more than a half variance of all the variables (55%), which is a good indicator.

Below you may see more bright and clear 2d and 2d visualizations of PCA. I have also tried to show PCA grouped by countries, but as there are 92 of such, so the picture is not really clear. (but I had to try that!)

library(pca3d)
pca2d(pca_scores) #2d visualization (2 principle components)

pca3d(pca_scores) #3d visualization (3 principle components)

## [1] 0.14465857 0.09161782 0.15355832
## Creating new device

pca3d(pca_scores, group = the2021[,2]) #grouped by countries, 3 components

## [1] 0.14465857 0.09161782 0.15355832

The US and Canada VS the world

the2021$country_bin[as.character(the2021$location) == "United States" | 
                      as.character(the2021$location) == "Canada"] <- "US_C"
the2021$country_bin[as.character(the2021$location) != "United States" & 
                      as.character(the2021$location) != "Canada"] <- "Other"
the2021$country_bin <- as.factor(the2021$country_bin)
summary(the2021$country_bin)

## Other  US_C 
##  1247   201

First, I have divided the “location variable”, as I have been asked: the US and Canada VS the other countries. In total there are 1247 observations except from those, whoch are from Canada or US. Then, I have created a plot, which shows this division. From the graph I can say that universities from US and Canada do not show higher scores in teaching or scientific work than other.

pca2d(pca_scores, group = the2021[,12], legend = "bottomleft")

pca3d(pca_scores, group = the2021[,12], legend = "bottomleft")

## [1] 0.14465857 0.09161782 0.15355832

the2021$pc <- pca_scores$x
#summary(the2021)
t.test(the2021$pc ~ the2021$country_bin)

## 
##  Welch Two Sample t-test
## 
## data:  the2021$pc by the2021$country_bin
## t = 2.7844, df = 2383.9, p-value = 0.005404
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.02138085 0.12320978
## sample estimates:
## mean in group Other  mean in group US_C 
##          0.01003547         -0.06225985

T-test prooves my conclusion: p-value is significant (<.05), though the difference is small (0.07) - mean principle component for teaching, research and interantional scores for US and Candian universities is lower, than for the otehr universities.

PC1 <- the2021$pc[, 1]
PC2 <- the2021$pc[, 2]

ggplot solution (oof, so many labels, which are impossible to read…)

ggplot(the2021,aes(x=PC1,y=PC2))+
  theme_classic()+
  geom_hline(yintercept = 0,color="gray70")+
  geom_vline(xintercept = 0,color="gray70")+
  geom_point(aes(color=country_bin),alpha=0.55,size=3)+
  xlab("PC1")+
  ylab("PC2")+
  #xlim(-5,6)+
  ggtitle("US and Canada VS other countries uniersities' rating\nPCA on THE2021 data") +
  geom_text(aes(y=PC2+0.25,label=rownames(the2021)))

plotly solution - very nice! You can get all the needed information by hovering over the point

library(plotly)

p <- plot_ly(the2021,x=PC1, y=PC2,text=rownames(the2021),
             mode="markers", color = the2021$country_bin, marker=list(size=11))
layout(p,title="US and Canada VS other countries uniersities' rating\nPCA on THE2021 data",
            xaxis=list(title="PC1"),
            yaxis=list(title="PC2"))

PCA_HW

Anna Podolskaya

08/11/2020

Data description

PCA

The US and Canada VS the world