Best Global Universities: An Application of Using the try() Function

R for Pleasure

Nguyen Chi Dung

An Explanation of try() Function

It’s often the case that we encounter warnings or errors when running R commands. When an error occurs, execution of a function or a series of commands halts, which can be frustrating, especially if we want the remaining commands to keep running. There are various functions available in R for dealing with errors, and in this post we will consider some basic examples of how to make use of the try() function. Here’s a toy example:

#==================================
#  Explanation of try() function
#==================================

# Numeric inputs for the toy example:
my_inputs1 <- c(2, 4, 5, 10, 100)

# Take the natural log of each input and print it rounded to 3 decimals:
for (input in my_inputs1) {
  result <- log(input)
  message_output <- paste0("Log of ", input, " is ", round(result, 3))
  print(message_output)
}
## [1] "Log of 2 is 0.693"
## [1] "Log of 4 is 1.386"
## [1] "Log of 5 is 1.609"
## [1] "Log of 10 is 2.303"
## [1] "Log of 100 is 4.605"

If one of the inputs is non-numeric, the loop crashes on the non-numeric argument and does not finish the remaining calculations. It stops with the error message Error in log(input) : non-numeric argument to mathematical function:

# A list keeps each element's own type. c() would coerce every number to a
# character string because of the one string element, and log() would then
# fail on the very first element instead of on the non-numeric one.
my_inputs2 <- list(2, 4, "My name is Dung", 10, 100)

# No error handling here on purpose: the first two results are printed, then
# the loop stops at the third element with
# "non-numeric argument to mathematical function".
for (input in my_inputs2) {
  result <- log(input)
  message_output <- paste("Log of", input, "is", round(result, 3))
  print(message_output)
}

If we want to finish all calculations for the numeric inputs and bypass the non-numeric one, we can use the try() function:

for (input in my_inputs2) {

  # Bypass non-numeric input by using try() function:
  # try() returns the value of the expression on success and an object of
  # class "try-error" on failure (the error message is still printed).
  result <- try(log(input))

  # Skip inputs whose log could not be computed. Without this check the
  # message below would be built from a value that is not this input's log.
  # Note: elements of a character vector all fail here, numbers included.
  if (inherits(result, "try-error")) {
    next
  }

  message_output <- paste("Log of", input, "is", round(result, 3))
  print(message_output)
}

A Practical Application: Scraping Data For The Best Global Universities

#===========================================================================
#                       A Practical Application: 
#  R code for collecting data on the best global universities
#  from https://www.usnews.com/education/best-global-universities/rankings
#===========================================================================

# Load some package: 

library(rvest)
library(tidyverse)
library(magrittr)

# Function gets all university links from a page:
#
# x: URL of one rankings page (a single string).
# Returns a character vector holding the href of every <a> element on the
# page that contains the best-global-universities URL prefix and matches
# the pattern "[0-9$]".

get_link_from_page <- function(x) {
  # Pipe x into read_html() exactly once; `x %>% read_html(x)` would pass the
  # URL a second time as read_html()'s next positional argument.
  hrefs <- x %>%
    read_html() %>%
    html_nodes("a") %>%
    html_attr("href")

  # Anchors without an href come back as NA; drop them so NA does not leak
  # into the result through logical subsetting.
  hrefs <- hrefs[!is.na(hrefs)]

  uni_prefix <- "https://www.usnews.com/education/best-global-universities"
  on_site <- hrefs[str_detect(hrefs, uni_prefix)]

  # NOTE(review): "[0-9$]" matches any link containing a digit or a literal
  # "$". If "ends with a digit" was intended the pattern would be "[0-9]$".
  # Left unchanged -- confirm against the live page.
  on_site[str_detect(on_site, "[0-9$]")]
}

# Collect the university links found on rankings pages 1 through 30:
all_links <- unlist(
  lapply(
    paste0(
      "https://www.usnews.com/education/best-global-universities/rankings?page=",
      1:30
    ),
    get_link_from_page
  )
)

# A function collects data for a university:
#
# x: URL of one university's profile page.
# Returns a data frame with one column per indicator scraped from the
# ".t-slack.sep div" nodes, one column per subject field holding its rank,
# plus `uni_name` and `uni_add`.

get_data_for_university <- function(x) {

  html_content <- read_html(x)

  # Text of every node matching a CSS selector, with whitespace squished.
  scrape_text <- function(css) {
    html_content %>%
      html_nodes(css) %>%
      html_text() %>%
      str_squish()
  }

  # Indicator table: consecutive pairs of scraped strings become one row,
  # V1 holding the value and V2 the label (see the spread() call below).
  raw_df1 <- scrape_text(".t-slack.sep div") %>%
    matrix(ncol = 2, byrow = TRUE) %>%
    as.data.frame() %>%
    mutate_all(as.character)

  # Subject ranks; the first two ".thumb-left" entries are dropped.
  rank_field <- scrape_text(".thumb-left")[-c(1:2)]
  field_name <- scrape_text(".t-large a")

  # tibble() replaces the deprecated data_frame().
  raw_df2 <- tibble(field = field_name, rank = rank_field)

  # Reshape both tables from long to wide (one column per label / field).
  raw_df2_wide <- raw_df2 %>%
    spread(value = "rank", key = "field")
  raw_df1_wide <- raw_df1 %>%
    spread(value = "V1", key = "V2")

  # Get Uni Name + address:
  uni_name <- scrape_text(".h-biggest")
  uni_add <- scrape_text(".clearfix .t-slack:nth-child(3)")

  # Both wide tables get the university name so they can be joined on it.
  full_join(
    raw_df1_wide %>% mutate(uni_name = uni_name),
    raw_df2_wide %>% mutate(uni_name = uni_name, uni_add = uni_add),
    by = "uni_name"
  )
}

In order to collect data and bypass unavailable links, we should use the try() function in conjunction with a for loop:

# Use above function for collecting data for 2 universities.
# Each link is fetched inside try(), so a broken link prints its error and is
# skipped instead of stopping the loop. Results are kept per link in
# `my_df_list`; `my_df` ends up holding the last successful result.
# head() avoids NA entries when fewer than 2 links are available.
sample_links <- head(all_links, 2)
my_df_list <- vector("list", length(sample_links))

for (i in seq_along(sample_links)) {
  fetched <- try(get_data_for_university(sample_links[[i]]))
  if (inherits(fetched, "try-error")) {
    next
  }
  my_df_list[[i]] <- fetched
  my_df <- fetched
}

For convenience, we can use lapply() to collect data on all 300 universities, bearing in mind that some links may be unavailable:

# Use above function for collecting data for 300 universities.
# A link that fails is reported with a warning and dropped, rather than
# aborting the whole run. head() avoids NA entries when fewer than 300 links
# are available.
all_data_for_uni <- lapply(head(all_links, 300), function(link) {
  tryCatch(
    get_data_for_university(link),
    error = function(e) {
      warning("Skipping ", link, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})

# Keep only the links that produced a data frame.
all_data_for_uni <- Filter(Negate(is.null), all_data_for_uni)

Data Pre-processing:

#========================
#  Data Pre-processing
#========================


# Every column name that appears in at least one university's data frame:
name_all <- lapply(all_data_for_uni, names) %>% unlist() %>% unique()
n_names <- length(name_all)

# Stack the per-university data frames into one. bind_rows() matches columns
# by name and fills the columns a university lacks with NA, so there is no
# need to build a dummy frame of missing columns and merge() it in. It also
# avoids growing the result with rbind() inside a loop and does not fail on
# an empty list the way a `1:length()` loop does.
all_universities_df_full <- as.data.frame(bind_rows(all_data_for_uni))


# Keep the count and score indicators, strip every non-digit character from
# them and convert to numeric, then re-attach the university name.
df1 <- all_universities_df_full %>%
  select(
    `Total number of students`,
    `Total number of academic staff`,
    `Number of doctoral degrees awarded`,
    `Number of international staff`,
    `Number of new doctoral students`,
    `Number of international students`,
    `Global score`,
    `Number of research only staff`
  ) %>%
  mutate_all(function(txt) as.numeric(str_replace_all(txt, "[^0-9]", ""))) %>%
  mutate(uni_name = all_universities_df_full$uni_name)

# Spaces in column names become underscores.
names(df1) <- str_replace_all(names(df1), " ", "_")

# The digit-only cleanup above also removes any decimal point, which is
# presumably why the global score is divided by 10 here.
# NOTE(review): that presumes scores carry exactly one decimal place -- confirm.
df1 <- df1 %>%
  mutate(
    Global_score = Global_score / 10,
    ratio = Number_of_research_only_staff / Total_number_of_academic_staff
  )

#============================
#    Data Visualization 
#============================

# Make a draft plot: 

library(ggrepel)

# Scatter plot of global score against research-only staff, with a linear
# trend line and name labels for universities with at least 2000 such staff.
ggplot(df1, aes(Number_of_research_only_staff, Global_score)) +
  geom_point(alpha = 0.5, size = 5, color = "firebrick") +
  geom_text_repel(
    data = filter(df1, Number_of_research_only_staff >= 2000),
    aes(label = uni_name),
    force = 19, size = 3, color = "gray35", family = "Roboto"
  ) +
  geom_smooth(method = "lm", fill = "orange", color = "gray50", alpha = 0.2) +
  scale_x_continuous(limits = c(0, 6000)) +
  labs(
    x = "Number of Research only Staff",
    y = "Global Score",
    title = "The Relationship Between University Ranking and Number of Research Staff",
    subtitle = "Research Staff includes teachers, lecturers, researchers, scholars and professors of different academic ranks. Most of them\nhold a Ph.D or equivalent highest-level degree in their field and their main function is to carry out research activities",
    caption = "Source: https://www.usnews.com"
  ) +
  theme_minimal() +
  theme(
    plot.background = element_rect(fill = "#f5f5f2", color = NA),
    plot.title = element_text(family = "Roboto", size = 17),
    plot.subtitle = element_text(family = "Roboto", size = 10, colour = "gray30"),
    plot.caption = element_text(family = "Roboto", size = 10, color = "gray50"),
    # panel.grid.minor = element_blank(),
    axis.text = element_text(family = "Roboto", size = 9, face = "bold", color = "gray50"),
    axis.title = element_text(family = "Roboto", size = 11, face = "bold", colour = "gray30")
  )