Web Scraping in R

#loading the package
library(xml2)
library(rvest)
library(stringr)

# Address of the university profile page to be scraped
url <- "https://www.topuniversities.com/universities/universiti-malaya-um"

# Download the page and parse its HTML into a document object
webpage <- url %>% read_html()

# Scrape the university name: select the <h1> node(s), then extract their text
University_Name_html <- webpage %>% html_nodes("h1")
University_Name <- University_Name_html %>% html_text()
head(University_Name)

## [1] "Universiti Malaya (UM)"

# Scrape the location: keep only the first ".location" match found under
# "#uni_details_content", extract its text, and strip any carriage-return /
# line-feed characters from it
Location_html <- webpage %>% html_nodes("#uni_details_content .location")
Location <- Location_html[1] %>%
  html_text() %>%
  str_replace_all("[\r\n]", "")
head(Location)

## [1] "Kuala Lumpur Campus, Kuala Lumpur Malaysia"

# Scrape the QS World University Rankings value (kept as text)
QS_World_University_Rankings_html <- webpage %>%
  html_nodes(".uni_ranking span")
QS_World_University_Rankings <- QS_World_University_Rankings_html %>%
  html_text()
head(QS_World_University_Rankings)

## [1] "65"

# Scrape the QS stars value (kept as text)
QS_Stars_html <- webpage %>% html_nodes(".uni_str span")
QS_Stars <- QS_Stars_html %>% html_text()
head(QS_Stars)

## [1] "5"

# Scrape the status: text of ".info-setails" inside the 1st list item
Status_html <- webpage %>% html_nodes("li:nth-child(1) .info-setails")
Status <- Status_html %>% html_text()
head(Status)

## [1] "Public"

# Scrape the research output: text of ".info-setails" inside the 2nd list item
Research_Output_html <- webpage %>%
  html_nodes("li:nth-child(2) .info-setails")
Research_Output <- Research_Output_html %>% html_text()
head(Research_Output)

## [1] "Very High"

# Scrape the student/faculty ratio: text of ".info-setails" inside the 3rd
# list item (kept as text)
Student_Faculty_Ratio_html <- webpage %>%
  html_nodes("li:nth-child(3) .info-setails")
Student_Faculty_Ratio <- Student_Faculty_Ratio_html %>% html_text()
head(Student_Faculty_Ratio)

## [1] "8"

# Scrape the number of international students: text of ".info-setails" inside
# the 4th list item, with commas removed before converting to numeric
International_Students_html <- webpage %>%
  html_nodes("li:nth-child(4) .info-setails")
International_Students <- International_Students_html %>%
  html_text() %>%
  gsub(",", "", .) %>%
  as.numeric()
head(International_Students)

## [1] 3340

# Scrape the size category: text of ".info-setails" inside the 5th list item
Size_html <- webpage %>% html_nodes("li:nth-child(5) .info-setails")
Size <- Size_html %>% html_text()
head(Size)

## [1] "L"

# Scrape the total faculty count: text of ".info-setails" inside the 6th list
# item, with commas removed before converting to numeric
Total_Faculty_html <- webpage %>% html_nodes("li:nth-child(6) .info-setails")
Total_Faculty <- Total_Faculty_html %>%
  html_text() %>%
  gsub(",", "", .) %>%
  as.numeric()
head(Total_Faculty)

## [1] 2386

# Assemble the scraped vectors into a data frame, one column per field
university_data <- data.frame(
  University_Name = University_Name,
  Location = Location,
  QS_World_University_Rankings = QS_World_University_Rankings,
  QS_Stars = QS_Stars,
  Status = Status,
  Research_Output = Research_Output,
  Student_Faculty_Ratio = Student_Faculty_Ratio,
  International_Students = International_Students,
  Size = Size,
  Total_Faculty = Total_Faculty
)

#Structure of the data frame
str(university_data)

## 'data.frame':    1 obs. of  10 variables:
##  $ University_Name             : chr "Universiti Malaya (UM)"
##  $ Location                    : chr "Kuala Lumpur Campus, Kuala Lumpur Malaysia"
##  $ QS_World_University_Rankings: chr "65"
##  $ QS_Stars                    : chr "5"
##  $ Status                      : chr "Public"
##  $ Research_Output             : chr "Very High"
##  $ Student_Faculty_Ratio       : chr "8"
##  $ International_Students      : num 3340
##  $ Size                        : chr "L"
##  $ Total_Faculty               : num 2386

# Load the 'jsonlite' package to convert the data frame to JSON.
library(jsonlite)

# Serialise the data frame to a JSON string
json_data <- university_data %>% toJSON()

# Write the JSON text to the console
cat(json_data)

## [{"University_Name":"Universiti Malaya (UM)","Location":"Kuala Lumpur Campus, Kuala Lumpur Malaysia","QS_World_University_Rankings":"65","QS_Stars":"5","Status":"Public","Research_Output":"Very High","Student_Faculty_Ratio":"8","International_Students":3340,"Size":"L","Total_Faculty":2386}]

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print summary statistics for each column of the built-in `cars` data set
summary(object = cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Web Scraping in R

Liu Qiyi

2021/11/18

R Markdown

Including Plots