Wiki table

scrape

library(httr)

wiki <- GET("https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings")
wiki
## Response [https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings]
##   Date: 2020-12-12 10:20
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 274 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>Times Higher Education World University Rankings - Wikipedia</title>
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakF...
## "Articles with short description","Short description matches Wikidata","...
## "wgULSCurrentAutonym":"English","wgNoticeProject":"wikipedia","wgCentral...
## "ext.gadget.switcher","ext.centralauth.centralautologin","mmv.head","mmv...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.o...
## ...

load the data

library(XML)

doc1 <- readHTMLTable(
  doc = content(wiki, "text"))
#str(doc1)
top50 <- doc1[[3]] # save the third list - Times Higher Education World University Rankings—Top 50
top25 <- doc1[[4]] # save the third list - Times Higher Education World Reputation Rankings—Top 25
library(kableExtra)
top50[1:20, ] %>%
  kable() %>%
  kable_styling()
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
Institution 2010–11[42] 2011–12[43] 2012–13[44] 2013–14[45] 2014–15[46] 2015–16[47] 2016–17[48] 2017-18[49] 2018–19[50] 2019–20[51] 2020–21[52]
University of Oxford 6 4 2 2 3 2 1 1 1 1 1
Stanford University 4 2 3 4 4 3 3 3 3 4 2
Harvard University 1 2 4 2 2 6 6 6 6 7 3
California Institute of Technology 2 1 1 1 1 1 2 3 5 2 4
Massachusetts Institute of Technology 3 7 5 5 6 5 5 5 4 5 5
University of Cambridge 6 6 7 7 5 4 4 2 2 3 6
University of California, Berkeley 8 10 9 8 8 13 10 18 15 13 7
Yale University 10 11 11 11 9 12 12 12 8 8 8
Princeton University 5 5 6 6 7 7 7 7 7 6 9
University of Chicago 13 9 10 9 11 10 10 9 10 9 10
Imperial College London 9 8 8 10 9 8 8 8 9 10 11
Johns Hopkins University 13 14 16 15 15 11 17 13 12 12 12
University of Pennsylvania 1 16 15 16 16 17 13 10 12 11 13
Swiss Federal Institute of Technology in Zurich 15 15 12 14 13 9 9 10 11 13 14
University of California, Los Angeles 11 13 13 12 12 16 14 15 17 17 15
University College London 22 17 17 21 22 14 15 16 14 15 16
Columbia University 18 12 14 13 14 15 16 14 16 16 17
University of Toronto 17 19 21 20 20 19 22 22 21 18 18
Cornell University 14 20 18 19 19 18 19 19 19 19 19
tail(top50)

rewrite the headings, clean and check

#library(datapasta)
#vector_paste()

top50$V2[top50$V2 == "-"] <- NA
top50$V3[top50$V3 == "-"] <- NA
top50$V4[top50$V4 == "-"] <- NA
top50$V5[top50$V5 == "-"] <- NA
top50$V6[top50$V6 == "-"] <- NA
top50$V7[top50$V7 == "-"] <- NA
top50$V8[top50$V8 == "-"] <- NA

names(top50) <- c("Institution", "y2010", "y2011", "y2012", "y2013", "y2014", "y2015", "y2016", "y2017", "y2018", "y2019", "y2020")
top50 <- top50[-1, ]
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
top50 <- top50 %>% 
  mutate_if(sapply(top50, is.factor), as.character)
head(top50)
head(top50)
tail(top50)
str(top50)
## 'data.frame':    50 obs. of  12 variables:
##  $ Institution: chr  "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
##  $ y2010      : chr  "6" "4" "1" "2" ...
##  $ y2011      : chr  "4" "2" "2" "1" ...
##  $ y2012      : chr  "2" "3" "4" "1" ...
##  $ y2013      : chr  "2" "4" "2" "1" ...
##  $ y2014      : chr  "3" "4" "2" "1" ...
##  $ y2015      : chr  "2" "3" "6" "1" ...
##  $ y2016      : chr  "1" "3" "6" "2" ...
##  $ y2017      : chr  "1" "3" "6" "3" ...
##  $ y2018      : chr  "1" "3" "6" "5" ...
##  $ y2019      : chr  "1" "4" "7" "2" ...
##  $ y2020      : chr  "1" "2" "3" "4" ...

the final table

top50 %>%
  kable() %>%
  kable_styling()
Institution y2010 y2011 y2012 y2013 y2014 y2015 y2016 y2017 y2018 y2019 y2020
University of Oxford 6 4 2 2 3 2 1 1 1 1 1
Stanford University 4 2 3 4 4 3 3 3 3 4 2
Harvard University 1 2 4 2 2 6 6 6 6 7 3
California Institute of Technology 2 1 1 1 1 1 2 3 5 2 4
Massachusetts Institute of Technology 3 7 5 5 6 5 5 5 4 5 5
University of Cambridge 6 6 7 7 5 4 4 2 2 3 6
University of California, Berkeley 8 10 9 8 8 13 10 18 15 13 7
Yale University 10 11 11 11 9 12 12 12 8 8 8
Princeton University 5 5 6 6 7 7 7 7 7 6 9
University of Chicago 13 9 10 9 11 10 10 9 10 9 10
Imperial College London 9 8 8 10 9 8 8 8 9 10 11
Johns Hopkins University 13 14 16 15 15 11 17 13 12 12 12
University of Pennsylvania 1 16 15 16 16 17 13 10 12 11 13
Swiss Federal Institute of Technology in Zurich 15 15 12 14 13 9 9 10 11 13 14
University of California, Los Angeles 11 13 13 12 12 16 14 15 17 17 15
University College London 22 17 17 21 22 14 15 16 14 15 16
Columbia University 18 12 14 13 14 15 16 14 16 16 17
University of Toronto 17 19 21 20 20 19 22 22 21 18 18
Cornell University 14 20 18 19 19 18 19 19 19 19 19
Duke University 24 22 23 17 18 20 18 17 18 20 20
Tsinghua University 58 71 52 50 49 47 35 30 22 23 20
University of Michigan 15 18 20 18 17 21 21 21 20 21 22
Peking University 37 49 46 45 48 42 29 27 31 24 23
Northwestern University 25 26 19 22 21 25 20 20 25 22 24
National University of Singapore 34 40 29 26 25 26 24 22 23 25 25
New York University 60 44 41 40 38 30 32 27 27 29 26
London School of Economics and Political Science 86 47 39 32 34 23 25 25 26 27 27
Carnegie Mellon University 20 21 22 24 24 22 23 24 24 27 28
University of Washington 23 25 24 25 26 32 25 25 28 26 29
University of Edinburgh 40 36 32 39 36 24 27 27 29 30 30
University of Melbourne 36 37 28 34 33 33 33 32 32 32 31
Ludwig Maximilian University of Munich 61 45 48 55 29 29 30 34 32 32 32
University of California, San Diego 32 33 38 40 41 39 41 31 30 31 33
University of British Columbia 30 22 30 31 32 34 36 34 37 34 34
King’s College London 77 56 57 38 40 27 36 36 38 36 35
Karolinska Institute 43 32 42 36 44 28 28 38 40 41 36
University of Tokyo 26 30 27 23 23 43 39 46 42 36 36
Georgia Institute of Technology 27 24 25 28 27 41 33 33 34 38 38
University of Hong Kong 21 34 35 43 43 44 43 40 36 35 39
McGill University 35 28 34 35 39 38 42 42 44 42 40
Technical University of Munich 101 88 105 87 98 53 46 41 44 43 41
Heidelberg University 83 73 78 68 70 37 43 45 47 44 42
École Polytechnique Fédérale de Lausanne 48 46 40 37 34 31 30 38 35 38 43
University of Texas at Austin NA 29 25 27 28 46 50 49 39 38 44
Katholieke Universiteit Leuven 119 67 58 61 55 35 40 47 48 45 45
Université Paris Sciences et Lettres NA NA NA NA NA NA NA 72 41 45 46
Nanyang Technological University 174 169 86 76 61 55 54 52 51 48 47
University of Illinois at Urbana–Champaign 33 31 33 29 29 36 36 37 50 48 48
University of Wisconsin-Madison 43 27 31 30 29 50 45 43 43 51 49
Washington University in St. Louis 38 41 44 42 42 60 57 50 54 52 50
#library(labelled)
#variable.names(top50)
#var.labels <- c(Institution = "",
#                y2010 = "2010-2011",
#                y2011 = "2011-2012",
#                y2012 = "2012-2013",
#                y2013 = "2013-2014",
#                y2014 = "2014-2015",
#                y2015 = "2015-2016",
#                y2016 = "2016-2017",
#                y2017 = "2017-2018",
#                y2018 = "2018-2019",
#                y2019 = "2019-2020",
#                y2020 = "2020-2021")
#top50 <- labelled::set_variable_labels(top50, .labels = var.labels)
#str(top50)
#head(top50)

save the table and check how that works

write.csv(top50, file = "top50.csv")
df <- read.csv("top50.csv", row.names = NULL) 
head(df, 20)
str(df)
## 'data.frame':    50 obs. of  13 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Institution: Factor w/ 50 levels "California Institute of Technology",..: 42 25 8 1 17 35 32 50 24 36 ...
##  $ y2010      : int  6 4 1 2 3 6 8 10 5 13 ...
##  $ y2011      : int  4 2 2 1 7 6 10 11 5 9 ...
##  $ y2012      : int  2 3 4 1 5 7 9 11 6 10 ...
##  $ y2013      : int  2 4 2 1 5 7 8 11 6 9 ...
##  $ y2014      : int  3 4 2 1 6 5 8 9 7 11 ...
##  $ y2015      : int  2 3 6 1 5 4 13 12 7 10 ...
##  $ y2016      : int  1 3 6 2 5 4 10 12 7 10 ...
##  $ y2017      : int  1 3 6 3 5 2 18 12 7 9 ...
##  $ y2018      : int  1 3 6 5 4 2 15 8 7 10 ...
##  $ y2019      : int  1 4 7 2 5 3 13 8 6 9 ...
##  $ y2020      : int  1 2 3 4 5 6 7 8 9 10 ...
#top50 <- df %>% select("Institution", "y2010", "y2011", "y2012", "y2013", "y2014", "y2015", "y2016", "y2017", #"y2018", "y2019", "y2020")
#top50 <- top50 %>% 
#  mutate_if(sapply(top50, is.factor), as.character)
#head(top50, 20)
#str(top50)
#write.csv(top50, file = "top50.csv")
#df <- read.csv("top50.csv", row.names = NULL) 
#str(df)

Slope graph

top50_1 <- top50 %>% select("Institution", "y2010", "y2020")
str(top50_1)
## 'data.frame':    50 obs. of  3 variables:
##  $ Institution: chr  "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
##  $ y2010      : chr  "6" "4" "1" "2" ...
##  $ y2020      : chr  "1" "2" "3" "4" ...
head(top50_1)
top50_10 <- top50_1[1:10,]
top50new <- broom::fix_data_frame(
   t(top50_10), 
   newcol = "Year")
## Warning: This function is deprecated as of broom 0.7.0 and will be removed
## from a future release. Please see tibble::as_tibble().
top50new
colnames(top50new) <- top50new[1, ]
top50new <- top50new[-1, ]
top50new
top50new$Year <- top50new$Institution 
top50new <- top50new[, -1]
str(top50new)
## tibble [2 x 11] (S3: tbl_df/tbl/data.frame)
##  $ University of Oxford                 : chr [1:2] "6" "1"
##  $ Stanford University                  : chr [1:2] "4" "2"
##  $ Harvard University                   : chr [1:2] "1" "3"
##  $ California Institute of Technology   : chr [1:2] "2" "4"
##  $ Massachusetts Institute of Technology: chr [1:2] "3" "5"
##  $ University of Cambridge              : chr [1:2] "6" "6"
##  $ University of California, Berkeley   : chr [1:2] "8" "7"
##  $ Yale University                      : chr [1:2] "10" "8"
##  $ Princeton University                 : chr [1:2] "5" "9"
##  $ University of Chicago                : chr [1:2] "13" "10"
##  $ Year                                 : chr [1:2] "y2010" "y2020"
newtop50 <- reshape2::melt(top50new, id="Year", variable.name="Institution", value.name = "Rate")
newtop50
newtop50$Year <- factor(newtop50$Year, 
                         levels = c("y2010", "y2020"), 
                         labels = c("2010-11","2020-21"), 
                         ordered = TRUE)
head(newtop50)
library(ggplot2)
require(ggrepel)
## Loading required package: ggrepel
ggplot(data = newtop50, aes(x = Year, y = Rate, group = Institution)) +
  geom_line(aes(color = Institution, alpha = 1), size = 1)+
#  geom_text_repel(data = newtop50 %>% filter(Year == "2010-11"), 
#                  aes(label = Institution) , 
#                  hjust = "left", 
#                  fontface = "bold", 
#                  size = 3, 
#                  nudge_x = -.45, 
#                  direction = "y") +
#  geom_text_repel(data = newtop50 %>% filter(Year == "2020-21"), 
#                  aes(label = Institution) , 
#                  hjust = "right", 
#                  fontface = "bold", 
#                  size = 3, 
#                  nudge_x = .5, 
#                  direction = "y") +
  geom_label(aes(label = Rate), 
             size = 2.5, 
             label.padding = unit(0.05, "lines"), 
             label.size = 0.0) +
  labs(
    title = "The dinamics of top-10 Times Higher Education World University Rankings",
    subtitle = "Based on: the most reliable source",
    caption = "en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings"
  ) +
  theme_bw() 

As the deadline is already passed (i was sure it was till 00.00, but i’ve noticed the real timing when it was too late, oh), I’ll write not much but still my small insight and comments on the works I’ve done.

First, sorry for another delay. I felt really bad the last couple of days and it was difficult for me to handle this work on time. Anyway, better late than never, I’ve kinda handled it (and now i feel at least a little bit happy).

I’ve decided to show you the table with the whole top-50 (just in case) and as for the graph I’ve placed 10 universities there.

From the graph we see that :

  • In general top10 have not really changed. Institutions, which are on top for today had nearly the same ratings before. Only University of Chicago was not presented in top-10 for 2010-11 years, it was on the 13th place in that rating. Now it is the 10th one.

  • The first two leading universities (Oxford and Stanford) had a really nice jump from 4th and 6th places to the 2nd and the 1st ones, moving the three leaders to later places (3 to 5).

  • Only one university shows some strong decrease in its rating - Princeton lost its 5th place moving to the 9th one.

For some interest let’s also have a look on extended a bit version of the same statistics: I’ve added some middle years - 2013 and 2016.

top50_1 <- top50 %>% select("Institution", "y2010", "y2013", "y2016", "y2020")
#str(top50_1)
#head(top50_1)
top50_10 <- top50_1[1:10,]
top50new <- broom::fix_data_frame(
   t(top50_10), 
   newcol = "Year")
## Warning: This function is deprecated as of broom 0.7.0 and will be removed
## from a future release. Please see tibble::as_tibble().
#top50new
colnames(top50new) <- top50new[1, ]
top50new <- top50new[-1, ]
#top50new
top50new$Year <- top50new$Institution 
top50new <- top50new[, -1]
#str(top50new)
newtop50 <- reshape2::melt(top50new, id="Year", variable.name="Institution", value.name = "Rate")
#newtop50
newtop50$Year <- factor(newtop50$Year, 
                         levels = c("y2010", "y2013", "y2016", "y2020"), 
                         labels = c("2010-11", "2013-14", "2016-17", "2020-21"), 
                         ordered = TRUE)
#head(newtop50)
ggplot(data = newtop50, aes(x = Year, y = Rate, group = Institution)) +
  geom_line(aes(color = Institution, alpha = 1), size = 1)+
  geom_label(aes(label = Rate), 
             size = 2.5, 
             label.padding = unit(0.05, "lines"), 
             label.size = 0.0) +
  labs(
    title = "The dinamics of top-10 Times Higher Education World University Rankings\n(4 timepoints)",
    subtitle = "Based on: the most reliable source",
    caption = "en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings"
  ) +
  theme_bw() 

Here we see a better look on the rapid jump of Oxford University, smoother slopes of Stanford, Princeton and two universities of Technology (both Californian and the one from Massachusetts), and also some highly fluctuated institutions like Harvard, Cambridge, Berkeley and Yale, which has dropped out of top-10 in the middle of the decade, but rapidly came back in its last half.