Scraping data

library(httr)
library(ggplot2)
library(kableExtra)
library(ggrepel)
library(XML)

di <- GET("https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings")
di
## Response [https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings]
##   Date: 2020-12-12 12:49
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 274 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>Times Higher Education World University Rankings - Wikipedia</title>
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
## "Articles with short description","Short description matches Wikidata","Use B...
## "wgULSCurrentAutonym":"English","wgNoticeProject":"wikipedia","wgCentralAuthM...
## "ext.gadget.switcher","ext.centralauth.centralautologin","mmv.head","mmv.boot...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.option...
## ...
doc <- readHTMLTable(
  doc = content(di, "text"))
str(doc)
## List of 7
##  $ Times Higher Education World University Rankings                               :'data.frame': 9 obs. of  2 variables:
##   ..$ V1: chr [1:9] "" "Editor" "Categories" "Frequency" ...
##   ..$ V2: chr [1:9] NA "Phil Baty" "Higher education" "Annual" ...
##  $ NULL                                                                           :'data.frame': 6 obs. of  3 variables:
##   ..$ V1: chr [1:6] "Overall indicator" "Industry Income – innovation" "International diversity" "Teaching – the learning environment" ...
##   ..$ V2: chr [1:6] "Individual indicator" "Research income from industry (per academic staff)" "Ratio of international to domestic staff\nRatio of international to domestic students" "Reputational survey (teaching)\nPhDs awards per academic\nUndergrad. admitted per academic\nIncome per academic"| __truncated__ ...
##   ..$ V3: chr [1:6] "Percentage weighting" "2.5%" "3%\n2%" "15%\n6%\n4.5%\n2.25%\n2.25%" ...
##  $ Times Higher Education World University Rankings—Top 50[Note 1]                :'data.frame': 51 obs. of  12 variables:
##   ..$ V1 : chr [1:51] "Institution" "University of Oxford" "Stanford University" "Harvard University" ...
##   ..$ V2 : chr [1:51] "2010–11[42]" "6" "4" "1" ...
##   ..$ V3 : chr [1:51] "2011–12[43]" "4" "2" "2" ...
##   ..$ V4 : chr [1:51] "2012–13[44]" "2" "3" "4" ...
##   ..$ V5 : chr [1:51] "2013–14[45]" "2" "4" "2" ...
##   ..$ V6 : chr [1:51] "2014–15[46]" "3" "4" "2" ...
##   ..$ V7 : chr [1:51] "2015–16[47]" "2" "3" "6" ...
##   ..$ V8 : chr [1:51] "2016–17[48]" "1" "3" "6" ...
##   ..$ V9 : chr [1:51] "2017-18[49]" "1" "3" "6" ...
##   ..$ V10: chr [1:51] "2018–19[50]" "1" "3" "6" ...
##   ..$ V11: chr [1:51] "2019–20[51]" "1" "4" "7" ...
##   ..$ V12: chr [1:51] "2020–21[52]" "1" "2" "3" ...
##  $ Times Higher Education World Reputation Rankings—Top 25[Note 1]                :'data.frame': 26 obs. of  11 variables:
##   ..$ V1 : chr [1:26] "Institution" "Harvard University" "Massachusetts Institute of Technology" "Stanford University" ...
##   ..$ V2 : chr [1:26] "2011[57]" "1" "2" "5" ...
##   ..$ V3 : chr [1:26] "2012[58]" "1" "2" "4" ...
##   ..$ V4 : chr [1:26] "2013[59]" "1" "2" "6" ...
##   ..$ V5 : chr [1:26] "2014[60]" "1" "2" "3" ...
##   ..$ V6 : chr [1:26] "2015[61]" "1" "4" "5" ...
##   ..$ V7 : chr [1:26] "2016[62]" "1" "2" "3" ...
##   ..$ V8 : chr [1:26] "2017[63]" "1" "2" "3" ...
##   ..$ V9 : chr [1:26] "2018[64]" "1" "2" "3" ...
##   ..$ V10: chr [1:26] "2019[65]" "1" "2" "3" ...
##   ..$ V11: chr [1:26] "2020[66]" "1" "2" "3" ...
##  $ Times Higher Education Asia University Rankings as shown below – Top 20[Note 1]:'data.frame': 21 obs. of  9 variables:
##   ..$ V1: chr [1:21] "Institution" "Tsinghua University" "Peking University" "National University of Singapore" ...
##   ..$ V2: chr [1:21] "2013[68]" "6" "4" "2" ...
##   ..$ V3: chr [1:21] "2014[69]" "6" "5" "2" ...
##   ..$ V4: chr [1:21] "2015[70]" "5" "4" "2" ...
##   ..$ V5: chr [1:21] "2016[67]" "5" "2" "1" ...
##   ..$ V6: chr [1:21] "2017[71]" "3" "2" "1" ...
##   ..$ V7: chr [1:21] "2018[72]" "2" "3" "1" ...
##   ..$ V8: chr [1:21] "2019[73]" "1" "5" "2" ...
##   ..$ V9: chr [1:21] "2020[74]" "1" "2" "3" ...
##  $ Times Higher Education BRICS & Emerging Economies Rankings – Top 20[Note 1]    :'data.frame': 21 obs. of  8 variables:
##   ..$ V1: chr [1:21] "Institution" "Tsinghua University" "Peking University" "Zhejiang University" ...
##   ..$ V2: chr [1:21] "2014[75]" "2" "1" "22" ...
##   ..$ V3: chr [1:21] "2015[76]" "2" "1" "21" ...
##   ..$ V4: chr [1:21] "2016[77]" "2" "1" "8" ...
##   ..$ V5: chr [1:21] "2017[78]" "2" "1" "9" ...
##   ..$ V6: chr [1:21] "2018[79]" "2" "1" "6" ...
##   ..$ V7: chr [1:21] "2019[80]" "1" "2" "3" ...
##   ..$ V8: chr [1:21] "2020[81]" "1" "2" "3" ...
##  $ NULL                                                                           :'data.frame': 6 obs. of  2 variables:
##   ..$ V1: chr [1:6] ".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{fl"| __truncated__ "Global" "Regional" "National" ...
##   ..$ V2: chr [1:6] NA "Academic Ranking of World Universities\nCWTS Leiden Ranking\nGlobal University Ranking\nMines ParisTech: Profes"| __truncated__ "European Union\nLatin America\nSouth East Asia" "Australia\nExcellence in Research for Australia\nBrazil\nCanada\nMaclean's\nChina\nBCUR\nWu Shulian\nCUAA\nNetb"| __truncated__ ...
Edurank <- doc[[3]]
Edurank[1:12, ] %>%
  kable() %>%
  kable_styling()
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
Institution 2010–11[42] 2011–12[43] 2012–13[44] 2013–14[45] 2014–15[46] 2015–16[47] 2016–17[48] 2017-18[49] 2018–19[50] 2019–20[51] 2020–21[52]
University of Oxford 6 4 2 2 3 2 1 1 1 1 1
Stanford University 4 2 3 4 4 3 3 3 3 4 2
Harvard University 1 2 4 2 2 6 6 6 6 7 3
California Institute of Technology 2 1 1 1 1 1 2 3 5 2 4
Massachusetts Institute of Technology 3 7 5 5 6 5 5 5 4 5 5
University of Cambridge 6 6 7 7 5 4 4 2 2 3 6
University of California, Berkeley 8 10 9 8 8 13 10 18 15 13 7
Yale University 10 11 11 11 9 12 12 12 8 8 8
Princeton University 5 5 6 6 7 7 7 7 7 6 9
University of Chicago 13 9 10 9 11 10 10 9 10 9 10
Imperial College London 9 8 8 10 9 8 8 8 9 10 11
tail(Edurank)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
46 Katholieke Universiteit Leuven 119 67 58 61 55 35 40 47 48 45 45
47 Universite Paris Sciences et Lettres
72 41 45 46
48 Nanyang Technological University 174 169 86 76 61 55 54 52 51 48 47
49 University of Illinois at Urbana–Champaign 33 31 33 29 29 36 36 37 50 48 48
50 University of Wisconsin-Madison 43 27 31 30 29 50 45 43 43 51 49
51 Washington University in St. Louis 38 41 44 42 42 60 57 50 54 52 50

Working on table

names(Edurank) <- c("Institution", "2011", "2012", "2013",  "2014", "2015", "2016", "2017","2018", "2019", "2020", "2021")
Edurank <- Edurank[-1, ]

rownames(Edurank) <- NULL
head(Edurank, 10)
Institution 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
University of Oxford 6 4 2 2 3 2 1 1 1 1 1
Stanford University 4 2 3 4 4 3 3 3 3 4 2
Harvard University 1 2 4 2 2 6 6 6 6 7 3
California Institute of Technology 2 1 1 1 1 1 2 3 5 2 4
Massachusetts Institute of Technology 3 7 5 5 6 5 5 5 4 5 5
University of Cambridge 6 6 7 7 5 4 4 2 2 3 6
University of California, Berkeley 8 10 9 8 8 13 10 18 15 13 7
Yale University 10 11 11 11 9 12 12 12 8 8 8
Princeton University 5 5 6 6 7 7 7 7 7 6 9
University of Chicago 13 9 10 9 11 10 10 9 10 9 10
tail(Edurank)
Institution 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
45 Katholieke Universiteit Leuven 119 67 58 61 55 35 40 47 48 45 45
46 Universite Paris Sciences et Lettres
72 41 45 46
47 Nanyang Technological University 174 169 86 76 61 55 54 52 51 48 47
48 University of Illinois at Urbana–Champaign 33 31 33 29 29 36 36 37 50 48 48
49 University of Wisconsin-Madison 43 27 31 30 29 50 45 43 43 51 49
50 Washington University in St. Louis 38 41 44 42 42 60 57 50 54 52 50

Saving table

write.csv(Edurank, file = "Uni-ranks.csv", row.names = FALSE)
df <- read.csv("Uni-ranks.csv", row.names = NULL, header=T,check.names=F) 
head(df, 20)
Institution 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
University of Oxford 6 4 2 2 3 2 1 1 1 1 1
Stanford University 4 2 3 4 4 3 3 3 3 4 2
Harvard University 1 2 4 2 2 6 6 6 6 7 3
California Institute of Technology 2 1 1 1 1 1 2 3 5 2 4
Massachusetts Institute of Technology 3 7 5 5 6 5 5 5 4 5 5
University of Cambridge 6 6 7 7 5 4 4 2 2 3 6
University of California, Berkeley 8 10 9 8 8 13 10 18 15 13 7
Yale University 10 11 11 11 9 12 12 12 8 8 8
Princeton University 5 5 6 6 7 7 7 7 7 6 9
University of Chicago 13 9 10 9 11 10 10 9 10 9 10
Imperial College London 9 8 8 10 9 8 8 8 9 10 11
Johns Hopkins University 13 14 16 15 15 11 17 13 12 12 12
University of Pennsylvania 1 16 15 16 16 17 13 10 12 11 13
Swiss Federal Institute of Technology in Zurich 15 15 12 14 13 9 9 10 11 13 14
University of California, Los Angeles 11 13 13 12 12 16 14 15 17 17 15
University College London 22 17 17 21 22 14 15 16 14 15 16
Columbia University 18 12 14 13 14 15 16 14 16 16 17
University of Toronto 17 19 21 20 20 19 22 22 21 18 18
Cornell University 14 20 18 19 19 18 19 19 19 19 19
Duke University 24 22 23 17 18 20 18 17 18 20 20
str(df)
## 'data.frame':    50 obs. of  12 variables:
##  $ Institution: chr  "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
##  $ 2011       : chr  "6" "4" "1" "2" ...
##  $ 2012       : chr  "4" "2" "2" "1" ...
##  $ 2013       : chr  "2" "3" "4" "1" ...
##  $ 2014       : chr  "2" "4" "2" "1" ...
##  $ 2015       : chr  "3" "4" "2" "1" ...
##  $ 2016       : chr  "2" "3" "6" "1" ...
##  $ 2017       : chr  "1" "3" "6" "2" ...
##  $ 2018       : int  1 3 6 3 5 2 18 12 7 9 ...
##  $ 2019       : int  1 3 6 5 4 2 15 8 7 10 ...
##  $ 2020       : int  1 4 7 2 5 3 13 8 6 9 ...
##  $ 2021       : int  1 2 3 4 5 6 7 8 9 10 ...

Manipulating data

df1= df %>% dplyr::select(Institution, `2011`,`2021` ) 

df1$`2011`=as.numeric(as.character(df1$`2011`))
df1$`2021`=as.numeric(as.character(df1$`2021`))
df1 = df1 %>% dplyr::arrange((`2021`))
df1.1 = head(df1, 10)

Changing data format

library(reshape2)
d <- melt(df1.1, id.vars = 'Institution')

d$year = d$variable
d$rang = d$value
df2=d %>% dplyr::select(Institution, rang, year)
df2$rang = as.numeric(as.character(df2$rang))
df2$year = as.factor(df2$year)

Building a slopegraph

MySpecial <- list(  
  scale_x_discrete(position = "top"),
  theme_bw(),
  theme(legend.position = "none"),
  theme(panel.border     = element_blank()),
  theme(axis.title.y     = element_blank()),
  theme(axis.text.y      = element_blank()),
  theme(panel.grid.major.y = element_blank()),
  theme(panel.grid.minor.y = element_blank()),
  theme(axis.title.x     = element_blank()),
  theme(panel.grid.major.x = element_blank()),
  theme(axis.text.x.top      = element_text(size=12)),
  theme(axis.ticks       = element_blank()),
  theme(plot.title       = element_text(size=14, face = "bold", hjust = 0.5)),
  theme(plot.subtitle    = element_text(hjust = 0.5))
)
ggplot(data = df2, aes(x = year, y = rang, group = Institution)) +
  geom_line(aes(color = Institution, alpha = 1), size = 1) +
  geom_text_repel(data = df2 %>% dplyr::filter(year == "2011"), 
                  aes(label = Institution) , 
                  hjust = "left", 
                  fontface = "bold", 
                  size = 3, 
                  nudge_x = -.45, 
                  direction = "y") +
  geom_text_repel(data = df2 %>% dplyr::filter(year == "2021"), 
                  aes(label = Institution) , 
                  hjust = "right", 
                  fontface = "bold", 
                  size = 3, 
                  nudge_x = .5, 
                  direction = "y") +
  geom_label(aes(label = rang), 
             size = 2.5, 
             label.padding = unit(0.05, "lines"), 
             label.size = 0.0) +
  MySpecial +
  labs(
    title = "Trends in rankings for TOP-10 universities 2011-2021",
    subtitle = "Most universities have imroved their ranks by 10 years",
    caption = "https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings"
  )