#loading the package
library(xml2)
library(rvest)
library(stringr)
# Address of the QS Top Universities profile page to scrape
url <- "https://www.topuniversities.com/universities/universiti-malaya-um"
# Fetch the page and parse its HTML into a document that can be queried
webpage <- xml2::read_html(x = url)
# Scrape the university name from the page heading.
# html_node() (singular) returns exactly one node: the first <h1>, or a
# missing node (whose text is NA) when the page has none. The field therefore
# always has length 1 and cannot break or duplicate rows in the data frame
# built from it further down.
University_Name_html <- html_node(webpage, "h1")
University_Name <- html_text(University_Name_html)
head(University_Name)
## [1] "Universiti Malaya (UM)"
# Scrape the campus location.
# html_node() picks the first matching element, or a missing node (text NA)
# when nothing matches, so Location is always a single value even if the
# page layout changes.
Location_html <- html_node(webpage, "#uni_details_content .location")
Location <- html_text(Location_html)
# Strip carriage returns and line feeds embedded in the extracted text
Location <- str_replace_all(Location, "[\r\n]" , "")
head(Location)
## [1] "Kuala Lumpur Campus, Kuala Lumpur Malaysia"
# Scrape the QS World University Rankings position.
# html_node() returns only the first matching <span>, or a missing node
# (text NA) when there is none, so the result is always length 1.
# The value is kept as text, exactly as the page presents it.
QS_World_University_Rankings_html <- html_node(webpage, ".uni_ranking span")
QS_World_University_Rankings <- html_text(QS_World_University_Rankings_html)
head(QS_World_University_Rankings)
## [1] "65"
# Scrape the QS Stars rating.
# html_node() returns only the first matching <span>, or a missing node
# (text NA) when the page carries no rating, so the result is always
# length 1.
QS_Stars_html <- html_node(webpage, ".uni_str span")
QS_Stars <- html_text(QS_Stars_html)
head(QS_Stars)
## [1] "5"
# Scrape the institution status (first item of the info list).
# html_node() returns the first match only, or a missing node (text NA) when
# the selector matches nothing, so Status is always a single value.
# NOTE(review): ".info-setails" is spelled as in the original selector;
# presumably it mirrors the site's own class name -- confirm before "fixing".
Status_html <- html_node(webpage, "li:nth-child(1) .info-setails")
Status <- html_text(Status_html)
head(Status)
## [1] "Public"
# Scrape the research output level (second item of the info list).
# html_node() returns the first match only, or a missing node (text NA) when
# the selector matches nothing, so Research_Output is always a single value.
Research_Output_html <- html_node(webpage, "li:nth-child(2) .info-setails")
Research_Output <- html_text(Research_Output_html)
head(Research_Output)
## [1] "Very High"
# Scrape the student/faculty ratio (third item of the info list).
# html_node() returns the first match only, or a missing node (text NA) when
# the selector matches nothing, so the result is always a single value.
# The value is kept as text, exactly as the page presents it.
Student_Faculty_Ratio_html <- html_node(webpage, "li:nth-child(3) .info-setails")
Student_Faculty_Ratio <- html_text(Student_Faculty_Ratio_html)
head(Student_Faculty_Ratio)
## [1] "8"
# Scrape the number of international students (fourth item of the info list).
# html_node() returns the first match only, or a missing node (text NA) when
# the selector matches nothing, so the result is always a single value.
International_Students_html <- html_node(webpage, "li:nth-child(4) .info-setails")
International_Students <- html_text(International_Students_html)
# Drop thousands separators before converting; a missing value stays NA
International_Students <- as.numeric(gsub(",","", International_Students))
head(International_Students)
## [1] 3340
# Scrape the institution size code (fifth item of the info list).
# html_node() returns the first match only, or a missing node (text NA) when
# the selector matches nothing, so Size is always a single value.
Size_html <- html_node(webpage, "li:nth-child(5) .info-setails")
Size <- html_text(Size_html)
head(Size)
## [1] "L"
# Scrape the total faculty count (sixth item of the info list).
# html_node() returns the first match only, or a missing node (text NA) when
# the selector matches nothing, so the result is always a single value.
Total_Faculty_html <- html_node(webpage, "li:nth-child(6) .info-setails")
Total_Faculty <- html_text(Total_Faculty_html)
# Drop thousands separators before converting; a missing value stays NA
Total_Faculty <- as.numeric(gsub(",","", Total_Faculty))
head(Total_Faculty)
## [1] 2386
# Assemble the ten scraped vectors into one data frame, one column per field
university_data <- data.frame(
  University_Name = University_Name,
  Location = Location,
  QS_World_University_Rankings = QS_World_University_Rankings,
  QS_Stars = QS_Stars,
  Status = Status,
  Research_Output = Research_Output,
  Student_Faculty_Ratio = Student_Faculty_Ratio,
  International_Students = International_Students,
  Size = Size,
  Total_Faculty = Total_Faculty
)
# Display each column's type alongside its value
str(university_data)
## 'data.frame': 1 obs. of 10 variables:
## $ University_Name : chr "Universiti Malaya (UM)"
## $ Location : chr "Kuala Lumpur Campus, Kuala Lumpur Malaysia"
## $ QS_World_University_Rankings: chr "65"
## $ QS_Stars : chr "5"
## $ Status : chr "Public"
## $ Research_Output : chr "Very High"
## $ Student_Faculty_Ratio : chr "8"
## $ International_Students : num 3340
## $ Size : chr "L"
## $ Total_Faculty : num 2386
# Load jsonlite, which supplies toJSON() for serialising R objects
library(jsonlite)
# Serialise the data frame row-wise: a JSON array with one object per row
# ("rows" is toJSON()'s default layout for data frames, spelled out here)
json_data <- toJSON(university_data, dataframe = "rows")
# Write the JSON text to standard output
cat(json_data)
## [{"University_Name":"Universiti Malaya (UM)","Location":"Kuala Lumpur Campus, Kuala Lumpur Malaysia","QS_World_University_Rankings":"65","QS_Stars":"5","Status":"Public","Research_Output":"Very High","Student_Faculty_Ratio":"8","International_Students":3340,"Size":"L","Total_Faculty":2386}]
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Example chunk: print summary statistics for each column of `cars`
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.