Scraping data

library(httr)
library(ggplot2)
library(kableExtra)
library(ggrepel)
library(XML)

di <- GET("https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings")
di

## Response [https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings]
##   Date: 2020-12-12 12:49
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 274 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>Times Higher Education World University Rankings - Wikipedia</title>
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
## "Articles with short description","Short description matches Wikidata","Use B...
## "wgULSCurrentAutonym":"English","wgNoticeProject":"wikipedia","wgCentralAuthM...
## "ext.gadget.switcher","ext.centralauth.centralautologin","mmv.head","mmv.boot...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.option...
## ...

doc <- readHTMLTable(
  doc = content(di, "text"))
str(doc)

## List of 7
##  $ Times Higher Education World University Rankings                               :'data.frame': 9 obs. of  2 variables:
##   ..$ V1: chr [1:9] "" "Editor" "Categories" "Frequency" ...
##   ..$ V2: chr [1:9] NA "Phil Baty" "Higher education" "Annual" ...
##  $ NULL                                                                           :'data.frame': 6 obs. of  3 variables:
##   ..$ V1: chr [1:6] "Overall indicator" "Industry Income – innovation" "International diversity" "Teaching – the learning environment" ...
##   ..$ V2: chr [1:6] "Individual indicator" "Research income from industry (per academic staff)" "Ratio of international to domestic staff\nRatio of international to domestic students" "Reputational survey (teaching)\nPhDs awards per academic\nUndergrad. admitted per academic\nIncome per academic"| __truncated__ ...
##   ..$ V3: chr [1:6] "Percentage weighting" "2.5%" "3%\n2%" "15%\n6%\n4.5%\n2.25%\n2.25%" ...
##  $ Times Higher Education World University Rankings—Top 50[Note 1]                :'data.frame': 51 obs. of  12 variables:
##   ..$ V1 : chr [1:51] "Institution" "University of Oxford" "Stanford University" "Harvard University" ...
##   ..$ V2 : chr [1:51] "2010–11[42]" "6" "4" "1" ...
##   ..$ V3 : chr [1:51] "2011–12[43]" "4" "2" "2" ...
##   ..$ V4 : chr [1:51] "2012–13[44]" "2" "3" "4" ...
##   ..$ V5 : chr [1:51] "2013–14[45]" "2" "4" "2" ...
##   ..$ V6 : chr [1:51] "2014–15[46]" "3" "4" "2" ...
##   ..$ V7 : chr [1:51] "2015–16[47]" "2" "3" "6" ...
##   ..$ V8 : chr [1:51] "2016–17[48]" "1" "3" "6" ...
##   ..$ V9 : chr [1:51] "2017-18[49]" "1" "3" "6" ...
##   ..$ V10: chr [1:51] "2018–19[50]" "1" "3" "6" ...
##   ..$ V11: chr [1:51] "2019–20[51]" "1" "4" "7" ...
##   ..$ V12: chr [1:51] "2020–21[52]" "1" "2" "3" ...
##  $ Times Higher Education World Reputation Rankings—Top 25[Note 1]                :'data.frame': 26 obs. of  11 variables:
##   ..$ V1 : chr [1:26] "Institution" "Harvard University" "Massachusetts Institute of Technology" "Stanford University" ...
##   ..$ V2 : chr [1:26] "2011[57]" "1" "2" "5" ...
##   ..$ V3 : chr [1:26] "2012[58]" "1" "2" "4" ...
##   ..$ V4 : chr [1:26] "2013[59]" "1" "2" "6" ...
##   ..$ V5 : chr [1:26] "2014[60]" "1" "2" "3" ...
##   ..$ V6 : chr [1:26] "2015[61]" "1" "4" "5" ...
##   ..$ V7 : chr [1:26] "2016[62]" "1" "2" "3" ...
##   ..$ V8 : chr [1:26] "2017[63]" "1" "2" "3" ...
##   ..$ V9 : chr [1:26] "2018[64]" "1" "2" "3" ...
##   ..$ V10: chr [1:26] "2019[65]" "1" "2" "3" ...
##   ..$ V11: chr [1:26] "2020[66]" "1" "2" "3" ...
##  $ Times Higher Education Asia University Rankings as shown below – Top 20[Note 1]:'data.frame': 21 obs. of  9 variables:
##   ..$ V1: chr [1:21] "Institution" "Tsinghua University" "Peking University" "National University of Singapore" ...
##   ..$ V2: chr [1:21] "2013[68]" "6" "4" "2" ...
##   ..$ V3: chr [1:21] "2014[69]" "6" "5" "2" ...
##   ..$ V4: chr [1:21] "2015[70]" "5" "4" "2" ...
##   ..$ V5: chr [1:21] "2016[67]" "5" "2" "1" ...
##   ..$ V6: chr [1:21] "2017[71]" "3" "2" "1" ...
##   ..$ V7: chr [1:21] "2018[72]" "2" "3" "1" ...
##   ..$ V8: chr [1:21] "2019[73]" "1" "5" "2" ...
##   ..$ V9: chr [1:21] "2020[74]" "1" "2" "3" ...
##  $ Times Higher Education BRICS & Emerging Economies Rankings – Top 20[Note 1]    :'data.frame': 21 obs. of  8 variables:
##   ..$ V1: chr [1:21] "Institution" "Tsinghua University" "Peking University" "Zhejiang University" ...
##   ..$ V2: chr [1:21] "2014[75]" "2" "1" "22" ...
##   ..$ V3: chr [1:21] "2015[76]" "2" "1" "21" ...
##   ..$ V4: chr [1:21] "2016[77]" "2" "1" "8" ...
##   ..$ V5: chr [1:21] "2017[78]" "2" "1" "9" ...
##   ..$ V6: chr [1:21] "2018[79]" "2" "1" "6" ...
##   ..$ V7: chr [1:21] "2019[80]" "1" "2" "3" ...
##   ..$ V8: chr [1:21] "2020[81]" "1" "2" "3" ...
##  $ NULL                                                                           :'data.frame': 6 obs. of  2 variables:
##   ..$ V1: chr [1:6] ".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{fl"| __truncated__ "Global" "Regional" "National" ...
##   ..$ V2: chr [1:6] NA "Academic Ranking of World Universities\nCWTS Leiden Ranking\nGlobal University Ranking\nMines ParisTech: Profes"| __truncated__ "European Union\nLatin America\nSouth East Asia" "Australia\nExcellence in Research for Australia\nBrazil\nCanada\nMaclean's\nChina\nBCUR\nWu Shulian\nCUAA\nNetb"| __truncated__ ...

Edurank <- doc[[3]]

Edurank[1:12, ] %>%
  kable() %>%
  kable_styling()

V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12
Institution	2010–11[42]	2011–12[43]	2012–13[44]	2013–14[45]	2014–15[46]	2015–16[47]	2016–17[48]	2017-18[49]	2018–19[50]	2019–20[51]	2020–21[52]
University of Oxford	6	4	2	2	3	2	1	1	1	1	1
Stanford University	4	2	3	4	4	3	3	3	3	4	2
Harvard University	1	2	4	2	2	6	6	6	6	7	3
California Institute of Technology	2	1	1	1	1	1	2	3	5	2	4
Massachusetts Institute of Technology	3	7	5	5	6	5	5	5	4	5	5
University of Cambridge	6	6	7	7	5	4	4	2	2	3	6
University of California, Berkeley	8	10	9	8	8	13	10	18	15	13	7
Yale University	10	11	11	11	9	12	12	12	8	8	8
Princeton University	5	5	6	6	7	7	7	7	7	6	9
University of Chicago	13	9	10	9	11	10	10	9	10	9	10
Imperial College London	9	8	8	10	9	8	8	8	9	10	11

tail(Edurank)

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12
46	Katholieke Universiteit Leuven	119	67	58	61	55	35	40	47	48	45	45
47	Universite Paris Sciences et Lettres								72	41	45	46
48	Nanyang Technological University	174	169	86	76	61	55	54	52	51	48	47
49	University of Illinois at Urbana–Champaign	33	31	33	29	29	36	36	37	50	48	48
50	University of Wisconsin-Madison	43	27	31	30	29	50	45	43	43	51	49
51	Washington University in St. Louis	38	41	44	42	42	60	57	50	54	52	50

Working on table

names(Edurank) <- c("Institution", "2011", "2012", "2013",  "2014", "2015", "2016", "2017","2018", "2019", "2020", "2021")
Edurank <- Edurank[-1, ]

rownames(Edurank) <- NULL
head(Edurank, 10)

Institution	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021
University of Oxford	6	4	2	2	3	2	1	1	1	1	1
Stanford University	4	2	3	4	4	3	3	3	3	4	2
Harvard University	1	2	4	2	2	6	6	6	6	7	3
California Institute of Technology	2	1	1	1	1	1	2	3	5	2	4
Massachusetts Institute of Technology	3	7	5	5	6	5	5	5	4	5	5
University of Cambridge	6	6	7	7	5	4	4	2	2	3	6
University of California, Berkeley	8	10	9	8	8	13	10	18	15	13	7
Yale University	10	11	11	11	9	12	12	12	8	8	8
Princeton University	5	5	6	6	7	7	7	7	7	6	9
University of Chicago	13	9	10	9	11	10	10	9	10	9	10

tail(Edurank)

	Institution	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021
45	Katholieke Universiteit Leuven	119	67	58	61	55	35	40	47	48	45	45
46	Universite Paris Sciences et Lettres								72	41	45	46
47	Nanyang Technological University	174	169	86	76	61	55	54	52	51	48	47
48	University of Illinois at Urbana–Champaign	33	31	33	29	29	36	36	37	50	48	48
49	University of Wisconsin-Madison	43	27	31	30	29	50	45	43	43	51	49
50	Washington University in St. Louis	38	41	44	42	42	60	57	50	54	52	50

Saving table

write.csv(Edurank, file = "Uni-ranks.csv", row.names = FALSE)

df <- read.csv("Uni-ranks.csv", row.names = NULL, header=T,check.names=F) 
head(df, 20)

Institution	2011	2012	2013	2014	2015	2016	2017	2018	2019	2020	2021
University of Oxford	6	4	2	2	3	2	1	1	1	1	1
Stanford University	4	2	3	4	4	3	3	3	3	4	2
Harvard University	1	2	4	2	2	6	6	6	6	7	3
California Institute of Technology	2	1	1	1	1	1	2	3	5	2	4
Massachusetts Institute of Technology	3	7	5	5	6	5	5	5	4	5	5
University of Cambridge	6	6	7	7	5	4	4	2	2	3	6
University of California, Berkeley	8	10	9	8	8	13	10	18	15	13	7
Yale University	10	11	11	11	9	12	12	12	8	8	8
Princeton University	5	5	6	6	7	7	7	7	7	6	9
University of Chicago	13	9	10	9	11	10	10	9	10	9	10
Imperial College London	9	8	8	10	9	8	8	8	9	10	11
Johns Hopkins University	13	14	16	15	15	11	17	13	12	12	12
University of Pennsylvania	1	16	15	16	16	17	13	10	12	11	13
Swiss Federal Institute of Technology in Zurich	15	15	12	14	13	9	9	10	11	13	14
University of California, Los Angeles	11	13	13	12	12	16	14	15	17	17	15
University College London	22	17	17	21	22	14	15	16	14	15	16
Columbia University	18	12	14	13	14	15	16	14	16	16	17
University of Toronto	17	19	21	20	20	19	22	22	21	18	18
Cornell University	14	20	18	19	19	18	19	19	19	19	19
Duke University	24	22	23	17	18	20	18	17	18	20	20

str(df)

## 'data.frame':    50 obs. of  12 variables:
##  $ Institution: chr  "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
##  $ 2011       : chr  "6" "4" "1" "2" ...
##  $ 2012       : chr  "4" "2" "2" "1" ...
##  $ 2013       : chr  "2" "3" "4" "1" ...
##  $ 2014       : chr  "2" "4" "2" "1" ...
##  $ 2015       : chr  "3" "4" "2" "1" ...
##  $ 2016       : chr  "2" "3" "6" "1" ...
##  $ 2017       : chr  "1" "3" "6" "2" ...
##  $ 2018       : int  1 3 6 3 5 2 18 12 7 9 ...
##  $ 2019       : int  1 3 6 5 4 2 15 8 7 10 ...
##  $ 2020       : int  1 4 7 2 5 3 13 8 6 9 ...
##  $ 2021       : int  1 2 3 4 5 6 7 8 9 10 ...

Manipulating data

df1= df %>% dplyr::select(Institution, `2011`,`2021` ) 

df1$`2011`=as.numeric(as.character(df1$`2011`))
df1$`2021`=as.numeric(as.character(df1$`2021`))
df1 = df1 %>% dplyr::arrange((`2021`))
df1.1 = head(df1, 10)

Changing data format

library(reshape2)
d <- melt(df1.1, id.vars = 'Institution')

d$year = d$variable
d$rang = d$value
df2=d %>% dplyr::select(Institution, rang, year)
df2$rang = as.numeric(as.character(df2$rang))
df2$year = as.factor(df2$year)

Building a slopegraph

MySpecial <- list(  
  scale_x_discrete(position = "top"),
  theme_bw(),
  theme(legend.position = "none"),
  theme(panel.border     = element_blank()),
  theme(axis.title.y     = element_blank()),
  theme(axis.text.y      = element_blank()),
  theme(panel.grid.major.y = element_blank()),
  theme(panel.grid.minor.y = element_blank()),
  theme(axis.title.x     = element_blank()),
  theme(panel.grid.major.x = element_blank()),
  theme(axis.text.x.top      = element_text(size=12)),
  theme(axis.ticks       = element_blank()),
  theme(plot.title       = element_text(size=14, face = "bold", hjust = 0.5)),
  theme(plot.subtitle    = element_text(hjust = 0.5))
)

ggplot(data = df2, aes(x = year, y = rang, group = Institution)) +
  geom_line(aes(color = Institution, alpha = 1), size = 1) +
  geom_text_repel(data = df2 %>% dplyr::filter(year == "2011"), 
                  aes(label = Institution) , 
                  hjust = "left", 
                  fontface = "bold", 
                  size = 3, 
                  nudge_x = -.45, 
                  direction = "y") +
  geom_text_repel(data = df2 %>% dplyr::filter(year == "2021"), 
                  aes(label = Institution) , 
                  hjust = "right", 
                  fontface = "bold", 
                  size = 3, 
                  nudge_x = .5, 
                  direction = "y") +
  geom_label(aes(label = rang), 
             size = 2.5, 
             label.padding = unit(0.05, "lines"), 
             label.size = 0.0) +
  MySpecial +
  labs(
    title = "Trends in rankings for TOP-10 universities 2011-2021",
    subtitle = "Most universities have imroved their ranks by 10 years",
    caption = "https://en.wikipedia.org/wiki/Times_Higher_Education_World_University_Rankings"
  )

Internet Database

Anastasia Bakhareva

07.10.2020

Scraping data

Working on table

Saving table

Manipulating data

Changing data format

Building a slopegraph