Webscraping Tool: Base R & Dplyr Approach

1 Advanced R Project: WebScraping Tool
- 1.1 Libraries used:
2 Scraping Function: Base R
3 Scraping Function: Dplyr
4 Benchmarking

1 Advanced R Project: WebScraping Tool

The aim of this project is to build a webscraping function in R based on two different approaches. First one uses mostly Base R, and the second one use Dplyr. We want to compare the time effinciency of both functions, to see if there is a difference of speed between these types of programming.

Our WebScraping Tool, scrapes the basic information from polish movie database Filmweb and performs basic EDA analysisto allow for visual validation of the results. One can choose another website to scrape with this function, yet in such a case, one needs to provide a new starting URL and define “nodes” to scrape from the website. Our function is pretty intuitive and easy to understand, so we think for someone familiar with R, it will be easy to use.

1.1 Libraries used:

library(stringi)
library(stringr)
library(tidyverse)
library(xml2)
library(rvest)
library(ggplot2)
library(gridExtra)
library(microbenchmark)
library(dplyr)

2 Scraping Function: Base R

# Define the function
scrappy = function(number_of_pages,url, silent = FALSE){
  
  # Check the validity of arguments
  if(is.na(url) | is.na(number_of_pages)){
    stop("Provide at least something...")
  }
  
  # Ask the user if he wants to conver the variable
  if(!is.numeric(number_of_pages) | number_of_pages==0){
    print("Unable to perform scraping because you provided incorrect input, make sure number of pages is an integer (not 0)!")
    statement <- readline("Convert it for you? (Yes/No) > ")
    
    if(statement == "Yes"){
      number_of_pages <- as.integer(round(number_of_pages, digits = 0))
    } 
    else{
      stop("The function will halt.")
    }
  }    
  
  tryCatch({
    
    # Make the page strings
    list_of_pages <- str_c(url, '&page=', 2:number_of_pages)
    data_base = tibble()
    
    # Iterate over the pages
    for(i in 1:number_of_pages){
      
      # Read HTML
      baza_filmow = read_html(list_of_pages[i])
      
      get_title_html = html_nodes(baza_filmow,".filmPreview__title")
      get_rating_html = html_nodes(baza_filmow,".rateBox__rate")
      get_genre_html = html_nodes(baza_filmow,".filmPreview__info--genres")
      get_country_html = html_nodes(baza_filmow,".filmPreview__info--countries")
      get_seen_html = html_nodes(baza_filmow,".rateBox__votes")
      get_iwanttosee_html = html_nodes(baza_filmow,".wantToSee__count")
      
      # Delete unnecessary strings
      remove = html_text(get_seen_html)
      remove_first = stri_replace_all(remove,"",fixed="głosy")
      seen = stri_replace_all(remove_first,"",fixed="głosow")
      seen = gsub(" ", "", seen, fixed = TRUE)
      remove_second = html_text(get_genre_html)
      genre = stri_replace_all(remove_second,"",fixed="gatunek:")
      remove_third = html_text(get_country_html)
      country = stri_replace_all(remove_third,"",fixed="kraj:")
      remove_fourth = html_text(get_rating_html)
      rating = stri_replace_all(remove_fourth,".",fixed=",")
      rating_second = stri_replace_all(rating,"",fixed="oceny")
      ratingss = stri_replace_all(rating_second,"",fixed="głosy")
      seen = gsub(" ", "", ratingss, fixed = TRUE)
      
      # Add spaces to remove extra genres
      genre= gsub("([a-z])([A-Z])", "\\1 \\2", genre)
      genre= gsub("([ł])([A-Z])", "\\1 \\2 \\3", genre)
      country = gsub("([a-z])([A-Z])", "\\1 \\2", country)
      country = gsub("([A])([A-Z])","\\1 \\2",country)
      
      # Cleaning (leaving only one factor)
      genre = gsub('([A-z]+) .*', '\\1', genre)
      genre = gsub('([ł]+) .*', '\\1', genre)
      country = gsub('([A-z]+) .*', '\\1 \\2',country)
      
      
      # Alliasing
      title = html_text(get_title_html)
      i_want_to_see = html_text(get_iwanttosee_html)
      i_want_to_see = gsub(" ", "", i_want_to_see, fixed = TRUE)
      
      
      # Building datatable
      data_table = tibble(movie_title = title,
                          movie_score = ratingss,
                          movie_genre = genre,
                          release_country = country,
                          people_seen = seen,
                          people_want_to_see = i_want_to_see)
      
      data_base = rbind(data_base,data_table, check.rows=FALSE)
      
      
    }
  }, error=function(e){})
  
  data_base = data_base[data_base$movie_score !=FALSE,]

  # Basic EDA analysis
  
  data_base_table_genre = as.data.frame(table(data_base$movie_genre))
  data_base_table_country = as.data.frame(table(data_base$release_country))
  data_base_table_score = as.data.frame(table(data_base$movie_score))

  
  plot1 <- ggplot(data_base_table_genre, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") + 
    labs(x="Genres", y="Proportion") + ggtitle("Movie Genres") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
  
  ggsave("basic.png")
 
  plot2 <- ggplot(data_base_table_country, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") + 
    labs(x="Countries", y="Proportion") + ggtitle("Release Countries") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
  
  ggsave("basic2.png")
  
  plot3 <- ggplot(data_base_table_score, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") + 
     labs(x="Scores", y="Proportion") + ggtitle("Movie Scores") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + geom_density()
   
  ggsave("basic3.png")
  
  if (silent == FALSE) {
    View(data_base)
    print(plot1)
    print(plot2)
    print(plot3)
  }
  
  write_csv(data_base,'Data_basic.csv')
}

3 Scraping Function: Dplyr

scrappyDPLYR = function(pages = 10, url = 'https://www.filmweb.pl/films/search?&orderBy=popularity', silent = FALSE){
  
  if(!is.numeric(pages)){
    stop("The number of pages must be... a number.")
  }
  
  if(!is.character(url)){
    stop("The url must be a string.")
  }
  
  # Try to do the loop
  tryCatch({
    results = tibble() # A tibble to hold the results
    
    # Iterate over the pages
    for(i in 1:pages){
      
      # Read HTML
      movies <- url %>% 
        paste0('&page=', i) %>% 
        read_html()
      
      title <- movies %>% 
        html_nodes(".filmPreview__title") %>% # Extract by CSS class
        html_text() # Get only the text
      
      ratings <- movies %>%
        html_nodes(".rateBox__rate") %>% 
        html_text() %>% 
        stri_replace_all(".",fixed=",") %>% # Convert the decimal points
        stri_replace_all("",fixed="oceny") %>% # Remove unwanted text
        stri_replace_all("",fixed="głosy") %>% 
        gsub(" ", "", ., fixed = TRUE) # Get rid of trailing spaces
      
      genre <- movies %>% 
        html_nodes(".filmPreview__info--genres") %>% 
        html_text() %>% 
        stri_replace_all("", fixed ="gatunek:") %>% 
        gsub("([a-z])([A-Z])", "\\1 \\2", .) %>% # Add to separate the extra genres
        gsub("([ł])([A-Z])", "\\1 \\2 \\3", .) %>% # Add to separate the extra genres
        gsub('([A-z]+) .*', '\\1', .) %>% # Leave only the first genre
        gsub('([ł]+) .*', '\\1', .) # Leave only the first genre
      
      country <- movies %>% 
        html_nodes(".filmPreview__info--countries") %>% 
        html_text() %>% 
        stri_replace_all("", fixed ="kraj:") %>% 
        gsub("([a-z])([A-Z])", "\\1 \\2", .) %>%
        gsub("([A])([A-Z])","\\1 \\2", .) %>%
        gsub('([A-z]+) .*', '\\1 \\2', .)
      
      seen <- movies %>% 
        html_nodes(".rateBox__votes") %>% 
        html_text() %>% 
        stri_replace_all("", fixed = "oceny") %>% 
        stri_replace_all("", fixed ="ocen") %>% 
        gsub(" ", "", ., fixed = TRUE) %>%
        .[. != ""] # Remove empty strings
      
      wantToSee <- movies %>% 
        html_nodes(".wantToSee__count") %>% 
        html_text() %>% 
        gsub(" ", "", ., fixed <- TRUE)

      # Adding the results to the data table
      results <- tibble(
        movie_title = title,
        movie_score = ratings,
        movie_genre = genre,
        release_country = country,
        people_seen = seen,
        people_want_to_see = wantToSee) %>%
          rbind(results, .,  check.rows=FALSE) # Bind the results with the existing tibble
      
    }
  }, error=function(e){})
  
  # Filter the data
  results <- results[results$movie_score !=FALSE,]
  
  # Write the data as a CSV for futher use
  results %>% write_csv('dataDPLYR.csv')
  
  # Show and export basic plots to check the validity of the data
  plot1 <- results$movie_genre %>% 
    table() %>% 
    as.data.frame() %>% 
    ggplot(aes(x=., y=Freq, fill=.)) +
    geom_bar(stat="identity") + 
    labs(x="Genres", y="Proportion") + 
    ggtitle("Movie Genres") + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  ggsave("dplyr.png")

  
  plot2 <- results$release_country %>% 
    table() %>% 
    as.data.frame() %>% 
    ggplot(aes(x=., y=Freq, fill=.)) + 
    geom_bar(stat="identity") + 
    labs(x="Countries", y="Proportion") + 
    ggtitle("Release Countries") + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  ggsave("dplyr2.png")

  
  plot3 <- results$movie_score %>% 
    table() %>% 
    as.data.frame() %>% 
    ggplot(aes(x=., y=Freq, fill=.)) + 
    geom_bar(stat="identity") + 
    labs(x="Scores", y="Proportion") + 
    ggtitle("Movie Scores") + 
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) + 
    geom_density()
  ggsave("dplyr3.png")

  
  if (silent == FALSE) {
    results %>% View()
    plot1 %>% print()
    plot2 %>% print()
    plot3 %>% print()
  }

}

############################################################################################
# Function execution
############################################################################################
  
# Let's run the functions over 10 pages
pages <- 10
  
# Base url for the scrapping - can contain filters in the form of GET parameters
url <- 'https://www.filmweb.pl/films/search?endRate=10&orderBy=popularity&descending=true&startCount=100&startRate=1'
  
# Run the functions
scrappy(pages,url)

scrappyDPLYR(pages,url)

4 Benchmarking

### Let's test the performance of both functions. 
#Both are set to silent, so they don't show the graphs and tables, but they still save the results
time <- microbenchmark(
  scrappy(10, url, silent = TRUE),
  scrappyDPLYR(10, url, silent = TRUE),
  times=10)

# Print the results
a <- time %>% print(unit = "s", order = 'median', signif = 3)

kable(a)

expr	min	lq	mean	median	uq	max	neval	cld
scrappy(10, url, silent = TRUE)	5.68	5.78	6.9	6.32	7.36	9.73	10	a
scrappyDPLYR(10, url, silent = TRUE)	5.90	6.61	6.9	6.79	6.96	8.70	10	a

# A nice graph
time %>% ggplot2::autoplot()

# let's see if there is a statistical difference
x <- time %>% filter(expr == 'scrappy(10, url, silent = TRUE)') %>% transmute(time = time/1000000000) # filter and convert values to seconds
y <- time %>% filter(expr == 'scrappyDPLYR(10, url, silent = TRUE)') %>% transmute(time = time/1000000000)

t.test(x$time,y$time) # The difference in mean values is not statistically significant


    Welch Two Sample t-test

data:  x$time and y$time
t = -0.00082547, df = 13.725, p-value = 0.9994
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.129430  1.128562
sample estimates:
mean of x mean of y 
 6.895365  6.895798