Scraping data
library(httr)
library(ggplot2)
library(kableExtra)
library(ggrepel)
library(XML)
di <- GET("https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings")
di
## Response [https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings]
## Date: 2020-12-12 12:49
## Status: 200
## Content-Type: text/html; charset=UTF-8
## Size: 274 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>Times Higher Education World University Rankings - Wikipedia</title>
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
## "Articles with short description","Short description matches Wikidata","Use B...
## "wgULSCurrentAutonym":"English","wgNoticeProject":"wikipedia","wgCentralAuthM...
## "ext.gadget.switcher","ext.centralauth.centralautologin","mmv.head","mmv.boot...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.option...
## ...
doc <- readHTMLTable(
doc = content(di, "text"))
str(doc)
## List of 7
## $ Times Higher Education World University Rankings :'data.frame': 9 obs. of 2 variables:
## ..$ V1: chr [1:9] "" "Editor" "Categories" "Frequency" ...
## ..$ V2: chr [1:9] NA "Phil Baty" "Higher education" "Annual" ...
## $ NULL :'data.frame': 6 obs. of 3 variables:
## ..$ V1: chr [1:6] "Overall indicator" "Industry Income – innovation" "International diversity" "Teaching – the learning environment" ...
## ..$ V2: chr [1:6] "Individual indicator" "Research income from industry (per academic staff)" "Ratio of international to domestic staff\nRatio of international to domestic students" "Reputational survey (teaching)\nPhDs awards per academic\nUndergrad. admitted per academic\nIncome per academic"| __truncated__ ...
## ..$ V3: chr [1:6] "Percentage weighting" "2.5%" "3%\n2%" "15%\n6%\n4.5%\n2.25%\n2.25%" ...
## $ Times Higher Education World University Rankings—Top 50[Note 1] :'data.frame': 51 obs. of 12 variables:
## ..$ V1 : chr [1:51] "Institution" "University of Oxford" "Stanford University" "Harvard University" ...
## ..$ V2 : chr [1:51] "2010–11[42]" "6" "4" "1" ...
## ..$ V3 : chr [1:51] "2011–12[43]" "4" "2" "2" ...
## ..$ V4 : chr [1:51] "2012–13[44]" "2" "3" "4" ...
## ..$ V5 : chr [1:51] "2013–14[45]" "2" "4" "2" ...
## ..$ V6 : chr [1:51] "2014–15[46]" "3" "4" "2" ...
## ..$ V7 : chr [1:51] "2015–16[47]" "2" "3" "6" ...
## ..$ V8 : chr [1:51] "2016–17[48]" "1" "3" "6" ...
## ..$ V9 : chr [1:51] "2017-18[49]" "1" "3" "6" ...
## ..$ V10: chr [1:51] "2018–19[50]" "1" "3" "6" ...
## ..$ V11: chr [1:51] "2019–20[51]" "1" "4" "7" ...
## ..$ V12: chr [1:51] "2020–21[52]" "1" "2" "3" ...
## $ Times Higher Education World Reputation Rankings—Top 25[Note 1] :'data.frame': 26 obs. of 11 variables:
## ..$ V1 : chr [1:26] "Institution" "Harvard University" "Massachusetts Institute of Technology" "Stanford University" ...
## ..$ V2 : chr [1:26] "2011[57]" "1" "2" "5" ...
## ..$ V3 : chr [1:26] "2012[58]" "1" "2" "4" ...
## ..$ V4 : chr [1:26] "2013[59]" "1" "2" "6" ...
## ..$ V5 : chr [1:26] "2014[60]" "1" "2" "3" ...
## ..$ V6 : chr [1:26] "2015[61]" "1" "4" "5" ...
## ..$ V7 : chr [1:26] "2016[62]" "1" "2" "3" ...
## ..$ V8 : chr [1:26] "2017[63]" "1" "2" "3" ...
## ..$ V9 : chr [1:26] "2018[64]" "1" "2" "3" ...
## ..$ V10: chr [1:26] "2019[65]" "1" "2" "3" ...
## ..$ V11: chr [1:26] "2020[66]" "1" "2" "3" ...
## $ Times Higher Education Asia University Rankings as shown below – Top 20[Note 1]:'data.frame': 21 obs. of 9 variables:
## ..$ V1: chr [1:21] "Institution" "Tsinghua University" "Peking University" "National University of Singapore" ...
## ..$ V2: chr [1:21] "2013[68]" "6" "4" "2" ...
## ..$ V3: chr [1:21] "2014[69]" "6" "5" "2" ...
## ..$ V4: chr [1:21] "2015[70]" "5" "4" "2" ...
## ..$ V5: chr [1:21] "2016[67]" "5" "2" "1" ...
## ..$ V6: chr [1:21] "2017[71]" "3" "2" "1" ...
## ..$ V7: chr [1:21] "2018[72]" "2" "3" "1" ...
## ..$ V8: chr [1:21] "2019[73]" "1" "5" "2" ...
## ..$ V9: chr [1:21] "2020[74]" "1" "2" "3" ...
## $ Times Higher Education BRICS & Emerging Economies Rankings – Top 20[Note 1] :'data.frame': 21 obs. of 8 variables:
## ..$ V1: chr [1:21] "Institution" "Tsinghua University" "Peking University" "Zhejiang University" ...
## ..$ V2: chr [1:21] "2014[75]" "2" "1" "22" ...
## ..$ V3: chr [1:21] "2015[76]" "2" "1" "21" ...
## ..$ V4: chr [1:21] "2016[77]" "2" "1" "8" ...
## ..$ V5: chr [1:21] "2017[78]" "2" "1" "9" ...
## ..$ V6: chr [1:21] "2018[79]" "2" "1" "6" ...
## ..$ V7: chr [1:21] "2019[80]" "1" "2" "3" ...
## ..$ V8: chr [1:21] "2020[81]" "1" "2" "3" ...
## $ NULL :'data.frame': 6 obs. of 2 variables:
## ..$ V1: chr [1:6] ".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{fl"| __truncated__ "Global" "Regional" "National" ...
## ..$ V2: chr [1:6] NA "Academic Ranking of World Universities\nCWTS Leiden Ranking\nGlobal University Ranking\nMines ParisTech: Profes"| __truncated__ "European Union\nLatin America\nSouth East Asia" "Australia\nExcellence in Research for Australia\nBrazil\nCanada\nMaclean's\nChina\nBCUR\nWu Shulian\nCUAA\nNetb"| __truncated__ ...
Edurank[1:12, ] %>%
kable() %>%
kable_styling()
|
V1
|
V2
|
V3
|
V4
|
V5
|
V6
|
V7
|
V8
|
V9
|
V10
|
V11
|
V12
|
|
Institution
|
2010–11[42]
|
2011–12[43]
|
2012–13[44]
|
2013–14[45]
|
2014–15[46]
|
2015–16[47]
|
2016–17[48]
|
2017-18[49]
|
2018–19[50]
|
2019–20[51]
|
2020–21[52]
|
|
University of Oxford
|
6
|
4
|
2
|
2
|
3
|
2
|
1
|
1
|
1
|
1
|
1
|
|
Stanford University
|
4
|
2
|
3
|
4
|
4
|
3
|
3
|
3
|
3
|
4
|
2
|
|
Harvard University
|
1
|
2
|
4
|
2
|
2
|
6
|
6
|
6
|
6
|
7
|
3
|
|
California Institute of Technology
|
2
|
1
|
1
|
1
|
1
|
1
|
2
|
3
|
5
|
2
|
4
|
|
Massachusetts Institute of Technology
|
3
|
7
|
5
|
5
|
6
|
5
|
5
|
5
|
4
|
5
|
5
|
|
University of Cambridge
|
6
|
6
|
7
|
7
|
5
|
4
|
4
|
2
|
2
|
3
|
6
|
|
University of California, Berkeley
|
8
|
10
|
9
|
8
|
8
|
13
|
10
|
18
|
15
|
13
|
7
|
|
Yale University
|
10
|
11
|
11
|
11
|
9
|
12
|
12
|
12
|
8
|
8
|
8
|
|
Princeton University
|
5
|
5
|
6
|
6
|
7
|
7
|
7
|
7
|
7
|
6
|
9
|
|
University of Chicago
|
13
|
9
|
10
|
9
|
11
|
10
|
10
|
9
|
10
|
9
|
10
|
|
Imperial College London
|
9
|
8
|
8
|
10
|
9
|
8
|
8
|
8
|
9
|
10
|
11
|
|
|
V1
|
V2
|
V3
|
V4
|
V5
|
V6
|
V7
|
V8
|
V9
|
V10
|
V11
|
V12
|
|
46
|
Katholieke Universiteit Leuven
|
119
|
67
|
58
|
61
|
55
|
35
|
40
|
47
|
48
|
45
|
45
|
|
47
|
Universite Paris Sciences et Lettres
|
|
|
|
|
|
|
|
72
|
41
|
45
|
46
|
|
48
|
Nanyang Technological University
|
174
|
169
|
86
|
76
|
61
|
55
|
54
|
52
|
51
|
48
|
47
|
|
49
|
University of Illinois at Urbana–Champaign
|
33
|
31
|
33
|
29
|
29
|
36
|
36
|
37
|
50
|
48
|
48
|
|
50
|
University of Wisconsin-Madison
|
43
|
27
|
31
|
30
|
29
|
50
|
45
|
43
|
43
|
51
|
49
|
|
51
|
Washington University in St. Louis
|
38
|
41
|
44
|
42
|
42
|
60
|
57
|
50
|
54
|
52
|
50
|
Working on table
names(Edurank) <- c("Institution", "2011", "2012", "2013", "2014", "2015", "2016", "2017","2018", "2019", "2020", "2021")
Edurank <- Edurank[-1, ]
rownames(Edurank) <- NULL
head(Edurank, 10)
|
Institution
|
2011
|
2012
|
2013
|
2014
|
2015
|
2016
|
2017
|
2018
|
2019
|
2020
|
2021
|
|
University of Oxford
|
6
|
4
|
2
|
2
|
3
|
2
|
1
|
1
|
1
|
1
|
1
|
|
Stanford University
|
4
|
2
|
3
|
4
|
4
|
3
|
3
|
3
|
3
|
4
|
2
|
|
Harvard University
|
1
|
2
|
4
|
2
|
2
|
6
|
6
|
6
|
6
|
7
|
3
|
|
California Institute of Technology
|
2
|
1
|
1
|
1
|
1
|
1
|
2
|
3
|
5
|
2
|
4
|
|
Massachusetts Institute of Technology
|
3
|
7
|
5
|
5
|
6
|
5
|
5
|
5
|
4
|
5
|
5
|
|
University of Cambridge
|
6
|
6
|
7
|
7
|
5
|
4
|
4
|
2
|
2
|
3
|
6
|
|
University of California, Berkeley
|
8
|
10
|
9
|
8
|
8
|
13
|
10
|
18
|
15
|
13
|
7
|
|
Yale University
|
10
|
11
|
11
|
11
|
9
|
12
|
12
|
12
|
8
|
8
|
8
|
|
Princeton University
|
5
|
5
|
6
|
6
|
7
|
7
|
7
|
7
|
7
|
6
|
9
|
|
University of Chicago
|
13
|
9
|
10
|
9
|
11
|
10
|
10
|
9
|
10
|
9
|
10
|
|
|
Institution
|
2011
|
2012
|
2013
|
2014
|
2015
|
2016
|
2017
|
2018
|
2019
|
2020
|
2021
|
|
45
|
Katholieke Universiteit Leuven
|
119
|
67
|
58
|
61
|
55
|
35
|
40
|
47
|
48
|
45
|
45
|
|
46
|
Universite Paris Sciences et Lettres
|
|
|
|
|
|
|
|
72
|
41
|
45
|
46
|
|
47
|
Nanyang Technological University
|
174
|
169
|
86
|
76
|
61
|
55
|
54
|
52
|
51
|
48
|
47
|
|
48
|
University of Illinois at Urbana–Champaign
|
33
|
31
|
33
|
29
|
29
|
36
|
36
|
37
|
50
|
48
|
48
|
|
49
|
University of Wisconsin-Madison
|
43
|
27
|
31
|
30
|
29
|
50
|
45
|
43
|
43
|
51
|
49
|
|
50
|
Washington University in St. Louis
|
38
|
41
|
44
|
42
|
42
|
60
|
57
|
50
|
54
|
52
|
50
|
Saving table
write.csv(Edurank, file = "Uni-ranks.csv", row.names = FALSE)
df <- read.csv("Uni-ranks.csv", row.names = NULL, header=T,check.names=F)
head(df, 20)
|
Institution
|
2011
|
2012
|
2013
|
2014
|
2015
|
2016
|
2017
|
2018
|
2019
|
2020
|
2021
|
|
University of Oxford
|
6
|
4
|
2
|
2
|
3
|
2
|
1
|
1
|
1
|
1
|
1
|
|
Stanford University
|
4
|
2
|
3
|
4
|
4
|
3
|
3
|
3
|
3
|
4
|
2
|
|
Harvard University
|
1
|
2
|
4
|
2
|
2
|
6
|
6
|
6
|
6
|
7
|
3
|
|
California Institute of Technology
|
2
|
1
|
1
|
1
|
1
|
1
|
2
|
3
|
5
|
2
|
4
|
|
Massachusetts Institute of Technology
|
3
|
7
|
5
|
5
|
6
|
5
|
5
|
5
|
4
|
5
|
5
|
|
University of Cambridge
|
6
|
6
|
7
|
7
|
5
|
4
|
4
|
2
|
2
|
3
|
6
|
|
University of California, Berkeley
|
8
|
10
|
9
|
8
|
8
|
13
|
10
|
18
|
15
|
13
|
7
|
|
Yale University
|
10
|
11
|
11
|
11
|
9
|
12
|
12
|
12
|
8
|
8
|
8
|
|
Princeton University
|
5
|
5
|
6
|
6
|
7
|
7
|
7
|
7
|
7
|
6
|
9
|
|
University of Chicago
|
13
|
9
|
10
|
9
|
11
|
10
|
10
|
9
|
10
|
9
|
10
|
|
Imperial College London
|
9
|
8
|
8
|
10
|
9
|
8
|
8
|
8
|
9
|
10
|
11
|
|
Johns Hopkins University
|
13
|
14
|
16
|
15
|
15
|
11
|
17
|
13
|
12
|
12
|
12
|
|
University of Pennsylvania
|
1
|
16
|
15
|
16
|
16
|
17
|
13
|
10
|
12
|
11
|
13
|
|
Swiss Federal Institute of Technology in Zurich
|
15
|
15
|
12
|
14
|
13
|
9
|
9
|
10
|
11
|
13
|
14
|
|
University of California, Los Angeles
|
11
|
13
|
13
|
12
|
12
|
16
|
14
|
15
|
17
|
17
|
15
|
|
University College London
|
22
|
17
|
17
|
21
|
22
|
14
|
15
|
16
|
14
|
15
|
16
|
|
Columbia University
|
18
|
12
|
14
|
13
|
14
|
15
|
16
|
14
|
16
|
16
|
17
|
|
University of Toronto
|
17
|
19
|
21
|
20
|
20
|
19
|
22
|
22
|
21
|
18
|
18
|
|
Cornell University
|
14
|
20
|
18
|
19
|
19
|
18
|
19
|
19
|
19
|
19
|
19
|
|
Duke University
|
24
|
22
|
23
|
17
|
18
|
20
|
18
|
17
|
18
|
20
|
20
|
## 'data.frame': 50 obs. of 12 variables:
## $ Institution: chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ 2011 : chr "6" "4" "1" "2" ...
## $ 2012 : chr "4" "2" "2" "1" ...
## $ 2013 : chr "2" "3" "4" "1" ...
## $ 2014 : chr "2" "4" "2" "1" ...
## $ 2015 : chr "3" "4" "2" "1" ...
## $ 2016 : chr "2" "3" "6" "1" ...
## $ 2017 : chr "1" "3" "6" "2" ...
## $ 2018 : int 1 3 6 3 5 2 18 12 7 9 ...
## $ 2019 : int 1 3 6 5 4 2 15 8 7 10 ...
## $ 2020 : int 1 4 7 2 5 3 13 8 6 9 ...
## $ 2021 : int 1 2 3 4 5 6 7 8 9 10 ...
Manipulating data
df1= df %>% dplyr::select(Institution, `2011`,`2021` )
df1$`2011`=as.numeric(as.character(df1$`2011`))
df1$`2021`=as.numeric(as.character(df1$`2021`))
df1 = df1 %>% dplyr::arrange((`2021`))
df1.1 = head(df1, 10)
Building a slopegraph
MySpecial <- list(
scale_x_discrete(position = "top"),
theme_bw(),
theme(legend.position = "none"),
theme(panel.border = element_blank()),
theme(axis.title.y = element_blank()),
theme(axis.text.y = element_blank()),
theme(panel.grid.major.y = element_blank()),
theme(panel.grid.minor.y = element_blank()),
theme(axis.title.x = element_blank()),
theme(panel.grid.major.x = element_blank()),
theme(axis.text.x.top = element_text(size=12)),
theme(axis.ticks = element_blank()),
theme(plot.title = element_text(size=14, face = "bold", hjust = 0.5)),
theme(plot.subtitle = element_text(hjust = 0.5))
)
ggplot(data = df2, aes(x = year, y = rang, group = Institution)) +
geom_line(aes(color = Institution, alpha = 1), size = 1) +
geom_text_repel(data = df2 %>% dplyr::filter(year == "2011"),
aes(label = Institution) ,
hjust = "left",
fontface = "bold",
size = 3,
nudge_x = -.45,
direction = "y") +
geom_text_repel(data = df2 %>% dplyr::filter(year == "2021"),
aes(label = Institution) ,
hjust = "right",
fontface = "bold",
size = 3,
nudge_x = .5,
direction = "y") +
geom_label(aes(label = rang),
size = 2.5,
label.padding = unit(0.05, "lines"),
label.size = 0.0) +
MySpecial +
labs(
title = "Trends in rankings for TOP-10 universities 2011-2021",
subtitle = "Most universities have imroved their ranks by 10 years",
caption = "https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings"
)
