Webscraping Tool: Base R & Dplyr Approach
1 Advanced R Project: WebScraping Tool
The aim of this project is to build a webscraping function in R based on two different approaches. First one uses mostly Base R, and the second one use Dplyr. We want to compare the time effinciency of both functions, to see if there is a difference of speed between these types of programming.
Our WebScraping Tool, scrapes the basic information from polish movie database Filmweb and performs basic EDA analysisto allow for visual validation of the results. One can choose another website to scrape with this function, yet in such a case, one needs to provide a new starting URL and define “nodes” to scrape from the website. Our function is pretty intuitive and easy to understand, so we think for someone familiar with R, it will be easy to use.
2 Scraping Function: Base R
# Define the function
scrappy = function(number_of_pages,url, silent = FALSE){
# Check the validity of arguments
if(is.na(url) | is.na(number_of_pages)){
stop("Provide at least something...")
}
# Ask the user if he wants to conver the variable
if(!is.numeric(number_of_pages) | number_of_pages==0){
print("Unable to perform scraping because you provided incorrect input, make sure number of pages is an integer (not 0)!")
statement <- readline("Convert it for you? (Yes/No) > ")
if(statement == "Yes"){
number_of_pages <- as.integer(round(number_of_pages, digits = 0))
}
else{
stop("The function will halt.")
}
}
tryCatch({
# Make the page strings
list_of_pages <- str_c(url, '&page=', 2:number_of_pages)
data_base = tibble()
# Iterate over the pages
for(i in 1:number_of_pages){
# Read HTML
baza_filmow = read_html(list_of_pages[i])
get_title_html = html_nodes(baza_filmow,".filmPreview__title")
get_rating_html = html_nodes(baza_filmow,".rateBox__rate")
get_genre_html = html_nodes(baza_filmow,".filmPreview__info--genres")
get_country_html = html_nodes(baza_filmow,".filmPreview__info--countries")
get_seen_html = html_nodes(baza_filmow,".rateBox__votes")
get_iwanttosee_html = html_nodes(baza_filmow,".wantToSee__count")
# Delete unnecessary strings
remove = html_text(get_seen_html)
remove_first = stri_replace_all(remove,"",fixed="głosy")
seen = stri_replace_all(remove_first,"",fixed="głosow")
seen = gsub(" ", "", seen, fixed = TRUE)
remove_second = html_text(get_genre_html)
genre = stri_replace_all(remove_second,"",fixed="gatunek:")
remove_third = html_text(get_country_html)
country = stri_replace_all(remove_third,"",fixed="kraj:")
remove_fourth = html_text(get_rating_html)
rating = stri_replace_all(remove_fourth,".",fixed=",")
rating_second = stri_replace_all(rating,"",fixed="oceny")
ratingss = stri_replace_all(rating_second,"",fixed="głosy")
seen = gsub(" ", "", ratingss, fixed = TRUE)
# Add spaces to remove extra genres
genre= gsub("([a-z])([A-Z])", "\\1 \\2", genre)
genre= gsub("([ł])([A-Z])", "\\1 \\2 \\3", genre)
country = gsub("([a-z])([A-Z])", "\\1 \\2", country)
country = gsub("([A])([A-Z])","\\1 \\2",country)
# Cleaning (leaving only one factor)
genre = gsub('([A-z]+) .*', '\\1', genre)
genre = gsub('([ł]+) .*', '\\1', genre)
country = gsub('([A-z]+) .*', '\\1 \\2',country)
# Alliasing
title = html_text(get_title_html)
i_want_to_see = html_text(get_iwanttosee_html)
i_want_to_see = gsub(" ", "", i_want_to_see, fixed = TRUE)
# Building datatable
data_table = tibble(movie_title = title,
movie_score = ratingss,
movie_genre = genre,
release_country = country,
people_seen = seen,
people_want_to_see = i_want_to_see)
data_base = rbind(data_base,data_table, check.rows=FALSE)
}
}, error=function(e){})
data_base = data_base[data_base$movie_score !=FALSE,]
# Basic EDA analysis
data_base_table_genre = as.data.frame(table(data_base$movie_genre))
data_base_table_country = as.data.frame(table(data_base$release_country))
data_base_table_score = as.data.frame(table(data_base$movie_score))
plot1 <- ggplot(data_base_table_genre, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") +
labs(x="Genres", y="Proportion") + ggtitle("Movie Genres") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("basic.png")
plot2 <- ggplot(data_base_table_country, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") +
labs(x="Countries", y="Proportion") + ggtitle("Release Countries") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("basic2.png")
plot3 <- ggplot(data_base_table_score, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") +
labs(x="Scores", y="Proportion") + ggtitle("Movie Scores") + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + geom_density()
ggsave("basic3.png")
if (silent == FALSE) {
View(data_base)
print(plot1)
print(plot2)
print(plot3)
}
write_csv(data_base,'Data_basic.csv')
}3 Scraping Function: Dplyr
scrappyDPLYR = function(pages = 10, url = 'https://www.filmweb.pl/films/search?&orderBy=popularity', silent = FALSE){
if(!is.numeric(pages)){
stop("The number of pages must be... a number.")
}
if(!is.character(url)){
stop("The url must be a string.")
}
# Try to do the loop
tryCatch({
results = tibble() # A tibble to hold the results
# Iterate over the pages
for(i in 1:pages){
# Read HTML
movies <- url %>%
paste0('&page=', i) %>%
read_html()
title <- movies %>%
html_nodes(".filmPreview__title") %>% # Extract by CSS class
html_text() # Get only the text
ratings <- movies %>%
html_nodes(".rateBox__rate") %>%
html_text() %>%
stri_replace_all(".",fixed=",") %>% # Convert the decimal points
stri_replace_all("",fixed="oceny") %>% # Remove unwanted text
stri_replace_all("",fixed="głosy") %>%
gsub(" ", "", ., fixed = TRUE) # Get rid of trailing spaces
genre <- movies %>%
html_nodes(".filmPreview__info--genres") %>%
html_text() %>%
stri_replace_all("", fixed ="gatunek:") %>%
gsub("([a-z])([A-Z])", "\\1 \\2", .) %>% # Add to separate the extra genres
gsub("([ł])([A-Z])", "\\1 \\2 \\3", .) %>% # Add to separate the extra genres
gsub('([A-z]+) .*', '\\1', .) %>% # Leave only the first genre
gsub('([ł]+) .*', '\\1', .) # Leave only the first genre
country <- movies %>%
html_nodes(".filmPreview__info--countries") %>%
html_text() %>%
stri_replace_all("", fixed ="kraj:") %>%
gsub("([a-z])([A-Z])", "\\1 \\2", .) %>%
gsub("([A])([A-Z])","\\1 \\2", .) %>%
gsub('([A-z]+) .*', '\\1 \\2', .)
seen <- movies %>%
html_nodes(".rateBox__votes") %>%
html_text() %>%
stri_replace_all("", fixed = "oceny") %>%
stri_replace_all("", fixed ="ocen") %>%
gsub(" ", "", ., fixed = TRUE) %>%
.[. != ""] # Remove empty strings
wantToSee <- movies %>%
html_nodes(".wantToSee__count") %>%
html_text() %>%
gsub(" ", "", ., fixed <- TRUE)
# Adding the results to the data table
results <- tibble(
movie_title = title,
movie_score = ratings,
movie_genre = genre,
release_country = country,
people_seen = seen,
people_want_to_see = wantToSee) %>%
rbind(results, ., check.rows=FALSE) # Bind the results with the existing tibble
}
}, error=function(e){})
# Filter the data
results <- results[results$movie_score !=FALSE,]
# Write the data as a CSV for futher use
results %>% write_csv('dataDPLYR.csv')
# Show and export basic plots to check the validity of the data
plot1 <- results$movie_genre %>%
table() %>%
as.data.frame() %>%
ggplot(aes(x=., y=Freq, fill=.)) +
geom_bar(stat="identity") +
labs(x="Genres", y="Proportion") +
ggtitle("Movie Genres") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("dplyr.png")
plot2 <- results$release_country %>%
table() %>%
as.data.frame() %>%
ggplot(aes(x=., y=Freq, fill=.)) +
geom_bar(stat="identity") +
labs(x="Countries", y="Proportion") +
ggtitle("Release Countries") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
ggsave("dplyr2.png")
plot3 <- results$movie_score %>%
table() %>%
as.data.frame() %>%
ggplot(aes(x=., y=Freq, fill=.)) +
geom_bar(stat="identity") +
labs(x="Scores", y="Proportion") +
ggtitle("Movie Scores") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
geom_density()
ggsave("dplyr3.png")
if (silent == FALSE) {
results %>% View()
plot1 %>% print()
plot2 %>% print()
plot3 %>% print()
}
}############################################################################################
# Function execution
############################################################################################
# Let's run the functions over 10 pages
pages <- 10
# Base url for the scrapping - can contain filters in the form of GET parameters
url <- 'https://www.filmweb.pl/films/search?endRate=10&orderBy=popularity&descending=true&startCount=100&startRate=1'
# Run the functions
scrappy(pages,url)4 Benchmarking
### Let's test the performance of both functions.
#Both are set to silent, so they don't show the graphs and tables, but they still save the results
time <- microbenchmark(
scrappy(10, url, silent = TRUE),
scrappyDPLYR(10, url, silent = TRUE),
times=10)
# Print the results
a <- time %>% print(unit = "s", order = 'median', signif = 3)| expr | min | lq | mean | median | uq | max | neval | cld |
|---|---|---|---|---|---|---|---|---|
| scrappy(10, url, silent = TRUE) | 5.68 | 5.78 | 6.9 | 6.32 | 7.36 | 9.73 | 10 | a |
| scrappyDPLYR(10, url, silent = TRUE) | 5.90 | 6.61 | 6.9 | 6.79 | 6.96 | 8.70 | 10 | a |
# let's see if there is a statistical difference
x <- time %>% filter(expr == 'scrappy(10, url, silent = TRUE)') %>% transmute(time = time/1000000000) # filter and convert values to seconds
y <- time %>% filter(expr == 'scrappyDPLYR(10, url, silent = TRUE)') %>% transmute(time = time/1000000000)
t.test(x$time,y$time) # The difference in mean values is not statistically significant
Welch Two Sample t-test
data: x$time and y$time
t = -0.00082547, df = 13.725, p-value = 0.9994
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.129430 1.128562
sample estimates:
mean of x mean of y
6.895365 6.895798