Preparation

Libraries & Data

library(readr)
library(tidyverse)
library(ggplot2)

#special correlation counting:
library(Hmisc) 
library(scales)
library(stringr)

library(flextable)
library(data.table)
library(kableExtra)
library(cowplot)
library(broom)
library(FactoMineR)
library(ggrepel)
library(plotly)

THE2021 <- read_csv("THE2021.csv")

In this home task, THE 2021 data is used to practice the principal component analysis (PCA). This is a popular dimensionality reduction technique that can be helpful when we want to:

  • deal with the multicollinearity;
  • come up with some indexes;
  • generally understand the variables’ inter-relations.

Besides the excluded actual ranking scores, the data contains university names, location, and 9 numeric indicators which constitute the ranking - these 9 variables are briefly presented in the table below. Countries & universities would be used to annotate the PCA plotting results.

THE2021 <- THE2021 %>% 
  rename(teaching = scores_teaching,
         research = scores_research,
         industry_income = scores_industry_income,
         international = scores_international_outlook,
         n_students = stats_number_students,
         students_stuff = stats_student_staff_ratio,
         pc_int_students = stats_pc_intl_students,
         female_share = stats_female_share)

variable <- THE2021[3:11] %>% colnames() 
mean <- c(round(mean(THE2021$teaching), 2),
          round(mean(THE2021$research), 2),
          round(mean(THE2021$scores_citations), 2),
          round(mean(THE2021$industry_income), 2),
          round(mean(THE2021$international), 2),
          round(mean(THE2021$n_students), 2),
          round(mean(THE2021$students_stuff), 2),
          round(mean(THE2021$pc_int_students), 2),
          round(mean(THE2021$female_share), 2))
max <- c(max(THE2021$teaching),
         max(THE2021$research),
         max(THE2021$scores_citations),
         max(THE2021$industry_income),
         max(THE2021$international),
         max(THE2021$n_students),
         max(THE2021$students_stuff),
         max(THE2021$pc_int_students),
         max(THE2021$female_share))
min <- c(min(THE2021$teaching),
         min(THE2021$research),
         min(THE2021$scores_citations),
         min(THE2021$industry_income),
         min(THE2021$international),
         min(THE2021$n_students),
         min(THE2021$students_stuff),
         min(THE2021$pc_int_students),
         min(THE2021$female_share))

caption_min <- c("Aksaray University, Turkey",
                 "2 Iraqi universities",
                 "Saveetha University, India",
                 "17 universities",
                 "Siksha ‘O’ Anusandhan, India",
                 "Scuola Normale Superiore di Pisa, Italy",
                 "3 Japanese universities",
                 "111 universities",
                 "King Fahd University of Petroleum and Minerals, Saudi Arabia")

caption_max <- c("Harvard, U.S.",
                 "University of Oxford, UK",
                 "9 universities",
                 "17 universities",
                 "Macau University of Science and Technology, Macao",
                 "Cairo University, Egypt",
                 "TU Dortmund University, Germany",
                 "Macau University of Science and Technology, Macao",
                 "Banasthali University, India")


z <- as.data.table(THE2021[3])
z1 <- z[, list(
  variable = "teaching",
  list_col = list(.SD$teaching))]
z <- as.data.table(THE2021[4])
z2 <- z[, list(
  variable = "research",
  list_col = list(.SD$research))]
z <- as.data.table(THE2021[5])
z3 <- z[, list(
  variable = "scores_citations",
  list_col = list(.SD$scores_citations))]
z <- as.data.table(THE2021[6])
z4 <- z[, list(
  variable = "industry_income",
  list_col = list(.SD$industry_income))]
z <- as.data.table(THE2021[7])
z5 <- z[, list(
  variable = "international",
  list_col = list(.SD$international))]
z <- as.data.table(THE2021[8])
z6 <- z[, list(
  variable = "n_students",
  list_col = list(.SD$n_students))]
z <- as.data.table(THE2021[9])
z7 <- z[, list(
  variable = "students_stuff",
  list_col = list(.SD$students_stuff))]
z <- as.data.table(THE2021[10])
z8 <- z[, list(
  variable = "pc_int_students",
  list_col = list(.SD$pc_int_students))]
z <- as.data.table(THE2021[11])
z9 <- z[, list(
  variable = "female_share",
  list_col = list(.SD$female_share))]


z <- z1 %>% 
  full_join(z2) %>% 
  full_join(z3) %>% 
  full_join(z4) %>% 
  full_join(z5) %>% 
  full_join(z6) %>% 
  full_join(z7) %>% 
  full_join(z8) %>% 
  full_join(z9) %>%
  mutate(mean = mean,
         caption_max = caption_max,
         caption_min = caption_min) %>% 
  mutate(min = str_c(min, "\n", caption_min),
         max = str_c(max, "\n", caption_max)) %>% 
  select(-caption_min,
         -caption_max)

z <- z[, c(1,3,4,5,2)]

flextable(data = z) %>%
  compose(j = "list_col", value = as_paragraph(
    plot_chunk(value = list_col, type = "dens", col = "blue", 
               width = 1.5, height = .4, free_scale = TRUE)
  )) %>%  
  set_header_labels(list_col = "distribution") %>% 
  autofit() %>% 
  add_footer(variable = "THE 2021 includes 1,448 universities from 92 countries.") %>% 
  merge_at(j = 1:5, part = "footer")
rm(z, z1, z2, z3, z4, z5, z6, z7, z8, z9, caption_max, caption_min, max, min, mean, variable)

Even this rough description proposes some dependencies: the Macau University shares the highest values both for percent of international students and international outlook. It is also obvious that top-universities, like Stanford University or MIT, should be scored relatively high on each scale.

Check for the missing values:

THE2021 %>% 
  is.na() %>% 
  sum()
## [1] 0

Scaling the variables, in order to decrease the influence of the single larger variances:

scaled_THE2021 <- as.data.frame(apply(THE2021[,3:11], 2, rescale)) %>% 
  mutate(name = THE2021$name,
         location = THE2021$location)

Correlation scores!

Before the analysis, I want to have a sense of the problem size. To do this, I construct the correlation matrix in a way inspired by Katherine Hoffman, research biostatistician from NYC:

rcorr(as.matrix(THE2021[3:11])) %>% 
  map(~data.frame(.x)) %>%
  map(~rownames_to_column(.x, var="measure1")) %>% 
  map(~pivot_longer(.x, -measure1, "measure2")) %>% 
  bind_rows(.id = "id") %>% 
  pivot_wider(names_from = id, values_from = value) %>% 
  mutate(sig_p = ifelse(P < .05, T, F),
         r_if_sig = ifelse(sig_p, r, NA)) %>% 
  select(measure1, measure2, r, r_if_sig) %>%
  mutate(text = ifelse(is.na(r_if_sig), "", round(r_if_sig, 2))) %>% 
  ggplot(aes(measure1, measure2, fill = r)) +
  geom_tile() +
  labs(x = "",
       y = "",
       title = "Correlations in THE 2021 data",
       subtitle = "only the significant scores are presented",
       fill = "Pearson's\ncorrelation\n") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_fill_gradient2(low = "#FF140F",
                       mid = "#FFFAF4",
                       high = "#A8E7A2") +
  geom_text(aes(label = text), size = 3.5) +
  theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5"))

As expected, one of the highest correlations is found for international characteristics (percent of students, outlook). The highest correlation is observed for teaching and research - so, it is probably a myth that being good at researching means being not that good at teaching. The medium correlation scores are found between almost all pairs of variables but what stands out is the low association of the female share, students stuff, and n_students with all the other metrics. The absence of the strong negative correlations is also noticeable.

PCA

Performance

Without undue delay, let’s run the PCA using prcomp() function:

# for plotting & further work:
pca_fit <- scaled_THE2021 %>% 
  select(where(is.numeric)) %>% 
  prcomp()

pca_fit %>%
  tidy(matrix = "eigenvalues") %>% 
  round(2) %>% 
  kable() %>% 
  kable_classic()
PC std.dev percent cumulative
1 0.41 0.51 0.51
2 0.25 0.20 0.71
3 0.19 0.12 0.82
4 0.14 0.06 0.88
5 0.12 0.04 0.93
6 0.10 0.03 0.96
7 0.08 0.02 0.98
8 0.07 0.01 0.99
9 0.05 0.01 1.00

9 variables obviously results in 9 possible components. The first principal component explains 51% of the variation, while the second one explains 20% of the remaining variation. As its cumulative explained variance is more than 0.7, I will rely on these 2 PC further. Nevertheless, it is possible to add the third one as well: its scores are also good.

The same numbers but plotted:

pca_fit %>%
  tidy(matrix = "eigenvalues") %>% 
  ggplot(aes(as.character(PC), percent)) +
  geom_bar(stat = "identity",
           fill = "light blue") +
  scale_y_continuous(
    labels = scales::percent_format()) +
  labs(x = "dimensions",
       y = "% of explained variance",
       title = "Scree plot") +
  theme_bw() +
    theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5")) +
  geom_text(aes(label = round(percent, 2), vjust = -0.2))

Plotting, interpretation, story

Next, I want to plot the individuals using the selected components. To make the graph more interpretable, I decided to modify the location variable first:

  • USA, Japan, and UK are the countries over-represented: in the ranking, there are 174, 111, and 101 universities respectively. It is much bigger than for the other countries; for example, China occupies th 4th place with only 70 positions. These 3 countries were kept for the consideration - the difference in academic cultures of Japan and UK & USA can be thus analyzed.
  • The remaining universities from 89 countries were split into 2 categories. It is hard to interpret the plotting for these groups as they include diverse regions and occupy different plot parts.
locations_counted <- scaled_THE2021 %>% 
  count(location)

locations_counted$location1 = locations_counted$location
# 16 стран
middle <- locations_counted %>% 
  filter(n > 19) %>% 
  filter(n < 101) %>% 
  mutate(location1 = "countries with 20-100 universities")

low <- locations_counted %>% 
  filter(n < 20) %>% 
  mutate(location1 = "countries with less than 20 universities")

locations_counted <- locations_counted %>% 
  anti_join(middle,
            by = "location") %>% 
  full_join(middle) %>% 
  anti_join(low,
            by = "location") %>% 
  full_join(low)


scaled_THE2021 <- scaled_THE2021 %>% 
  full_join(locations_counted %>% 
              select(-n))

pca_fit %>%
  augment(scaled_THE2021) %>% # add original dataset back in
  ggplot(aes(.fittedPC1, .fittedPC2, color = location1)) + 
  geom_point(size = 1.5) +
  theme_bw() +
  labs(color = " new location",
       title = "Universities distribution among the 2 principal components",
       subtitle = "most of the universities are spread across the scales, so they are purposively poorly seen",
       x = "PC1 (51%)",
       y = "PC2 (20%)") +
  scale_color_manual(labels = c("20-70 universities in country", "<20 universities in country", "Japan", "UK", "USA"),
                     values = c("grey", "light blue", "red", "purple", "blue")) +
  theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5"),
        legend.key = element_rect(fill = "#F1F3F5"))

With some caution, it can be claimed that the difference between UK and Japan was captured! The universities from the United Kingdom scored generally low on PC2 scale and lower than Japanese universities on PC1. With some variation and possible outliers, Japan holds high PC1 values and medium PC2 values. The universities from the United States are somewhere in the middle: but again, there are too many of them, so, the intersections occur and, on the whole, it is the transition from the UK cluster to Japan.

To advance our understanding of the difference, we need to understand the components better:

arrow_style <- arrow(angle = 20, 
                     ends = "first", 
                     type = "closed", 
                     length = grid::unit(8, "pt"))

pca_fit %>%
  tidy(matrix = "rotation") %>%
  pivot_wider(names_from = "PC", 
              names_prefix = "PC", 
              values_from = "value") %>%
  ggplot(aes(PC1, PC2)) +
  geom_segment(xend = 0, yend = 0, arrow = arrow_style) +
  geom_text_repel(aes(label = column),
                  hjust = 0, nudge_x = 0,
                  color = "blue") +
  xlim(-1, .4) + ylim(-.5, 1) +
  theme_bw() +
  theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5")) +
  labs(x = "PC1 (51%)",
       y = "PC2 (20%)",
       title = "PCA graрh of variables")

Basing on the scores from the next table (where the 3rd PC is added for the comparison), research and teaching both contribute slightly more to the PC2 (positively) than to the PC1 (negatively). industry_income, though it is skewed and contributes to the PC1, is definitely the variable mostly responsible for the PC2 (respective loadings are -0.30 and 0.80). The ratio of students & stuff members, as well as the number of students, has no effect on the constructed PCs (even 3rd!). The share of females is opposite to the industry_income in terms of its influence on PC2. Finally, a group of international outlook & percent of international students and the citation scores together contribute to the PC1.

What is interesting (and obvious), the graph from above shows the patterns of correlations between the variables: research and teaching; scores_citations, international outlook, and percent of international students.

round(pca_fit$rotation, 2) %>% 
  as.data.frame() %>%
  select(PC1, PC2, PC3) %>% 
  kable(caption = "PCA loadings") %>% 
  kable_classic()
PCA loadings
PC1 PC2 PC3
teaching -0.29 0.20 -0.15
research -0.38 0.22 -0.07
scores_citations -0.57 -0.26 -0.68
industry_income -0.30 0.81 0.14
international -0.54 -0.35 0.61
n_students 0.00 -0.01 -0.05
students_stuff -0.01 -0.01 0.01
pc_int_students -0.25 -0.13 0.35
female_share -0.03 -0.23 -0.01

I am not that good at creating names, but what I propose is academic (PC1) vs. industry (PC2). It ignores some dimensions, like international outlook or the female share, but definitely catches something. The deal is that industry_income variable has the highest impact on the PC among all - it is described as the “indicator is of fundamental importance if you prioritise the real-world application of the science and technology research”. In the same source, it is written that this factor only becomes relevant for the students when they choose the university to apply to - than, probably, THE needs to come up with some new metrics to track this request more precisely.

plot <- pca_fit %>%
  augment(scaled_THE2021) %>% # add original dataset back in
  ggplot(aes(.fittedPC1, 
             .fittedPC2, 
             color = location1,
             text = paste(
               "Name: ", name, "\n",
               "Location: ", location, "\n",
               "Region: ", location1, "\n",
               "PC1 score: ", round(.fittedPC1,2), "\n",
               "PC2 score: ", round(.fittedPC2,2), "\n",
               sep = ""))) + 
  geom_point(size = 1.5) +
  theme_bw() +
  labs(color = "",
       title = "Interactive plotting of the PCA results",
       x = "academic (PC1, 51%)",
       y = "industry (PC2, 20%)") +
  scale_color_manual(labels = c("20-70 universities in country", "<20 universities in country", "Japan", "UK", "USA"),
                     values = c("grey", "light blue", "red", "purple", "blue")) +
  theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5"),
        legend.key = element_rect(fill = "#F1F3F5")) +
  geom_segment(aes(x = 0, y = 0, xend = -0.29, yend = 0.8),
               color = "black", 
               size = 1, arrow = arrow(length = unit(0.3, "cm"))) +
  annotate("text", x= -0.29, y=0.85, label= "industry_income", size = 4) +
  geom_segment(aes(x = 0, y = 0, xend = -0.37, yend = 0.21),
               color = "black", 
               size = 1, arrow = arrow(length = unit(0.3, "cm"))) +
  annotate("text", x= -0.45, y=0.15, label= "research", size = 4) +
  geom_segment(aes(x = 0, y = 0, xend = -0.57, yend = -0.26),
               color = "black", 
               size = 1, arrow = arrow(length = unit(1, "cm"))) +
  annotate("text", x= -0.45, y=-0.31, label= "citations", size = 4)

ggplotly(plot, tooltip = "text")

The updated graph shown above suggests that the universities located on the left occupy high positions in the THE ranking: University of Oxford, Stanford University, California Institute of Technology, and so on. It also seems that the direction from c(0,0) towards left upper corner corresponds to the technologies & programming: maybe, the interplay between industry_income and research & teaching corresponds to that. Though I have not highlighted Russian universities, some of them are to be found with high industry scores Lomonosov Moscow State University, Moscow University of Physics and Tehnology, Natonal Research University of Electronic Technology. They are surrounded by Chinese, German, and South Korean universities - what a company! I searched for the HSE for a long time and understood lately that it was not presented in the data.

As for the distinction between Japanese and UK universities, I suppose, international variables play the crucial role (if you remember, these variables coincide with the citations) - the proof is in the chart below (even USA location is verified with this measure):

try <- THE2021 %>% 
  filter(location == "Japan"|
           location == "United Kingdom"|
           location == "United States")
try$caption <- ""
try[179,12] <- "University of Tsukuba"
try[21,12] <- "London School of Economics and Political Science"
#try[122,12] <- "Illinois Institute of Technology"
try[22,12] <- "Carnegie Mellon University"

try %>% 
  ggplot(aes(location, pc_int_students, fill = location)) +
  geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) +
  theme_bw() +
  labs(title = "Shares of international students in universities",
       subtitle = "THE 2021 data",
       y = "",
       x = "") +
  geom_text(aes(label = caption), vjust = 1) +
  scale_fill_manual(values = c("red", "purple", "blue")) +
  theme(legend.position = "none") +
  theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5"))

(added after the submission, as well) ANOVA results suggest there is a difference between the average locations’ scores:

anova.test <- aov(pc_int_students ~ location, data = try)
summary(anova.test)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## location      2  32770   16385   201.9 <2e-16 ***
## Residuals   383  31082      81                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To conclude, I see 2 possibilities of extending this small research:

  • to relate the plotted universities’ positions with their THE rankings,
  • to adjust for the disciplines studied in each university thus developing an argument about the industry_income.

p.s. Surprisingly, I need to confess, it was fun to work with these data.

p.s.s. I have built another plot after the submission:

scaled_new <- scaled_THE2021 %>% 
  mutate(total_score = round((teaching + research + scores_citations + industry_income + international + n_students + students_stuff + pc_int_students + female_share)/9, 2))


plot1 <- pca_fit %>%
  augment(scaled_new) %>% # add original dataset back in
  ggplot(aes(.fittedPC1, 
             .fittedPC2, 
             color = total_score,
             text = paste(
               "Name: ", name, "\n",
               "Location: ", location, "\n",
               "Region: ", location1, "\n",
               "PC1 score: ", round(.fittedPC1,2), "\n",
               "PC2 score: ", round(.fittedPC2,2), "\n",
               "Total score: ", total_score, "\n",
               sep = ""))) + 
  geom_point(size = 1.5) +
  theme_bw() +
  labs(color = "",
       title = "Plotting colored with universities' average scores",
       x = "academic (PC1, 51%)",
       y = "industry (PC2, 20%)") +
  theme(plot.background = element_rect(fill = "#F1F3F5"),
        panel.background = element_rect(fill = "#F1F3F5"),
        legend.background = element_rect(fill = "#F1F3F5"),
        legend.key = element_rect(fill = "#F1F3F5")) +
  geom_segment(aes(x = 0, y = 0, xend = -0.29, yend = 0.8),
               color = "black", 
               size = 1, arrow = arrow(length = unit(0.3, "cm"))) +
  annotate("text", x= -0.29, y=0.85, label= "industry_income", size = 4) +
  geom_segment(aes(x = 0, y = 0, xend = -0.37, yend = 0.21),
               color = "black", 
               size = 1, arrow = arrow(length = unit(0.3, "cm"))) +
  annotate("text", x= -0.45, y=0.15, label= "research", size = 4) +
  geom_segment(aes(x = 0, y = 0, xend = -0.57, yend = -0.26),
               color = "black", 
               size = 1, arrow = arrow(length = unit(0.3, "cm"))) +
  annotate("text", x= -0.45, y=-0.31, label= "citations", size = 4) +
  scale_color_gradient(low = "#e1eec3",
                        high = "#C06C84")

plot1

#ggplotly(plot1, tooltip = "text")