What is Web Scraping?
Web scraping is a technique for converting data that is published on the web in an unstructured format (HTML tags) into a structured format that can easily be accessed and used.
Almost all major languages provide ways to perform web scraping. In this article, we’ll use R to scrape data on the most popular feature films of 2016 from the IMDb website.
We’ll get a number of features for each of the 100 most popular feature films released in 2016. We’ll also look at the most common problems that one might face while scraping data from the internet.
Creating a structured dataframe which contains details from IMDB database:
Missing values are handled automatically: if a value is not available for a particular feature, NA takes its place.
You can get the SelectorGadget extension, which is easily available on the Chrome Web Store.
Web scraping using R becomes quite handy with the help of “rvest” package.
At times scraping becomes difficult because a consistent pattern is hard to find in the web pages.
You need the CSS selector of an element to fetch the desired text from the web page. Follow these steps to get it.
Step 1:
Get the selector gadget for chrome :
https://chrome.google.com/webstore/detail/selectorgadget/mhjhnkcfbdhnjickkkdbjoemdmbfginb?hl=en
Step 2:
Step 3:
# Packages used throughout this walkthrough.
library(rvest) # read_html(), html_nodes(), html_text()
library(dplyr) # mutate(), left_join()
library(data.table) # tstrsplit(), used to split the director/cast text
library(pander) # pandoc.table(), used to print the previews

# IMDb search results: feature films released in 2016, 100 per page, page 1.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature&page=1"
# Download and parse the page once; every section below queries this object.
webpage <- read_html(url)
### rank
# ".text-primary" is the CSS selector that SelectorGadget reports for the rank
# label of each film on this page.
rank_data_html <- webpage %>% html_nodes(".text-primary")
rank_data <- rank_data_html %>% html_text()

# Start the data frame from the rank column. Ranks are still text (e.g. "1.")
# at this point; they are converted to numbers at the very end.
df <- tibble(rank = rank_data)
pandoc.table(as.data.frame(head(df, n = 2)))
##
## ------
## rank
## ------
## 1.
##
## 2.
## ------
### title
# The link inside each list item's header holds the film title.
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text(trim = TRUE)

# No length check is applied here: per the author, every ranked film has a
# title, so titles and ranks are assumed to line up one-to-one.
df <- df %>% mutate(title = title_data)

# Preview the first two rows.
pandoc.table(as.data.frame(head(df, n = 2)))
##
## -----------------------------------
## rank title
## ------ ----------------------------
## 1. Captain America: Civil War
##
## 2. Doctor Strange
## -----------------------------------
### description
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
description_data <- description_data_html %>% html_text(trim = TRUE)
if (length(description_data) != 100) {
  # Some descriptions are missing: scrape ranks and descriptions in a single
  # pass. This relies on each description directly following its rank in the
  # combined result. (The "gross" section explains this fallback in detail.)
  description_data_html <- webpage %>%
    html_nodes(".text-primary , .ratings-bar+ .text-muted")
  description_data <- description_data_html %>% html_text(trim = TRUE)
  description <- data.frame(rank = NA, description = NA)
  # Visit only the entries that are not rank labels; the entry just before
  # each of them is taken as its rank.
  for (pos in which(!(description_data %in% rank_data))) {
    description[pos, "rank"] <- description_data[pos - 1]
    description[pos, "description"] <- description_data[pos]
  }
  # Ranks without a description find no match and end up as NA.
  df <- left_join(df, description, by = "rank")
} else {
  # One description per film, so they can be attached row by row.
  df <- df %>% mutate(description = description_data)
}
# Preview the first two rows.
pandoc.table(as.data.frame(head(df, n = 2)))
##
## --------------------------------------------------------------------
## rank title description
## ------ ---------------------------- --------------------------------
## 1. Captain America: Civil War Political involvement in the
## Avengers' activities causes a
## rift between Captain America
## and Iron Man.
##
## 2. Doctor Strange While on a journey of physical
## and spiritual healing, a
## brilliant neurosurgeon is
## drawn into the world of the
## mystic arts.
## --------------------------------------------------------------------
Here you can see that the description feature has been added to the existing dataframe/tibble “df”.
### stars
# IMDb rating of each film, stored as a numeric column in both branches.
star_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")
star_data <- html_text(star_data_html, trim = TRUE)
if (length(star_data) == length(rank_data)) {
  # One rating per ranked film: attach row by row. Convert to numeric here
  # too, so the column type does not depend on which branch ran.
  df <- mutate(df, star = as.numeric(star_data))
} else {
  # Some ratings are missing: scrape ranks and ratings in one pass. This
  # relies on each rating directly following its rank in the combined result.
  star_data_html <- html_nodes(
    webpage, ".text-primary , .ratings-imdb-rating strong"
  )
  star_data <- html_text(star_data_html, trim = TRUE)
  # Positions of the entries that are ratings (i.e. not rank labels).
  star_idx <- which(!(star_data %in% rank_data))
  # A value in first position has no preceding rank to pair with.
  star_idx <- star_idx[star_idx > 1]
  star <- data.frame(
    rank = star_data[star_idx - 1],
    star = as.numeric(star_data[star_idx]),
    stringsAsFactors = FALSE
  )
  # Films without a rating find no match and get NA.
  df <- left_join(df, star, by = "rank")
}
# Preview the first two rows.
df %>%
  head(n = 2) %>%
  as.data.frame() %>%
  pandoc.table()
##
## ---------------------------------------------------------------------------
## rank title description star
## ------ ---------------------------- -------------------------------- ------
## 1. Captain America: Civil War Political involvement in the 7.8
## Avengers' activities causes a
## rift between Captain America
## and Iron Man.
##
## 2. Doctor Strange While on a journey of physical 7.5
## and spiritual healing, a
## brilliant neurosurgeon is
## drawn into the world of the
## mystic arts.
## ---------------------------------------------------------------------------
### genre
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text(trim = TRUE)
if (length(genre_data) != 100) {
  # Some genres are missing: scrape ranks and genres together and pair every
  # genre with the rank entry just before it.
  genre_data_html <- webpage %>% html_nodes(".text-primary , .genre")
  genre_data <- genre_data_html %>% html_text(trim = TRUE)
  genre <- data.frame(rank = NA, genre = NA)
  for (pos in which(!(genre_data %in% rank_data))) {
    genre[pos, "rank"] <- genre_data[pos - 1]
    genre[pos, "genre"] <- genre_data[pos]
  }
  # Ranks without a genre find no match and end up as NA.
  df <- left_join(df, genre, by = "rank")
} else {
  # One genre string per film, so they can be attached row by row.
  df <- df %>% mutate(genre = genre_data)
}
# Preview the first two rows.
pandoc.table(as.data.frame(head(df, n = 2)))
##
## ---------------------------------------------------------------------------
## rank title description star
## ------ ---------------------------- -------------------------------- ------
## 1. Captain America: Civil War Political involvement in the 7.8
## Avengers' activities causes a
## rift between Captain America
## and Iron Man.
##
## 2. Doctor Strange While on a journey of physical 7.5
## and spiritual healing, a
## brilliant neurosurgeon is
## drawn into the world of the
## mystic arts.
## ---------------------------------------------------------------------------
##
## Table: Table continues below
##
##
## ----------------------------
## genre
## ----------------------------
## Action, Adventure, Sci-Fi
##
## Action, Adventure, Fantasy
## ----------------------------
### running time
running_time_data_html <- webpage %>% html_nodes(".text-muted .runtime")
running_time_data <- running_time_data_html %>% html_text()
if (length(running_time_data) != 100) {
  # Some runtimes are missing: scrape ranks and runtimes together and pair
  # every runtime with the rank entry just before it.
  running_time_data_html <- webpage %>%
    html_nodes(".text-primary , .text-muted .runtime")
  running_time_data <- running_time_data_html %>% html_text(trim = TRUE)
  # Drop the trailing " min" so only the number of minutes is left.
  running_time_data <- gsub(
    pattern = "\\smin$", replacement = "", x = running_time_data
  )
  running_time <- data.frame(rank = NA, running_time = NA)
  for (pos in which(!(running_time_data %in% rank_data))) {
    running_time[pos, "rank"] <- running_time_data[pos - 1]
    # Store the runtime as a number rather than text.
    running_time[pos, "running_time"] <- as.numeric(running_time_data[pos])
  }
  df <- left_join(df, running_time, by = "rank")
} else {
  # One runtime per film: strip the trailing " min" and convert to numeric.
  running_time_data <- as.numeric(
    gsub(pattern = "\\smin$", replacement = "", x = running_time_data)
  )
  df <- df %>% mutate(running_time = running_time_data)
}
# Preview the first two rows.
pandoc.table(as.data.frame(head(df, n = 2)))
##
## ---------------------------------------------------------------------------
## rank title description star
## ------ ---------------------------- -------------------------------- ------
## 1. Captain America: Civil War Political involvement in the 7.8
## Avengers' activities causes a
## rift between Captain America
## and Iron Man.
##
## 2. Doctor Strange While on a journey of physical 7.5
## and spiritual healing, a
## brilliant neurosurgeon is
## drawn into the world of the
## mystic arts.
## ---------------------------------------------------------------------------
##
## Table: Table continues below
##
##
## -------------------------------------------
## genre running_time
## ---------------------------- --------------
## Action, Adventure, Sci-Fi 147
##
## Action, Adventure, Fantasy 115
## -------------------------------------------
### release year
release_year_data_html <- webpage %>% html_nodes(".text-muted.unbold")
release_year_data <- release_year_data_html %>% html_text()
if (length(release_year_data) != 100) {
  # Some years are missing: scrape ranks and years together and pair every
  # year entry with the rank entry just before it.
  release_year_data_html <- webpage %>%
    html_nodes(".text-primary , .text-muted.unbold")
  release_year_data <- release_year_data_html %>% html_text(trim = TRUE)
  release_year <- data.frame(rank = NA, release_year = NA)
  for (pos in which(!(release_year_data %in% rank_data))) {
    release_year[pos, "rank"] <- release_year_data[pos - 1]
    release_year[pos, "release_year"] <- release_year_data[pos]
  }
  # Keep only the first run of digits and store it as a number.
  release_year$release_year <- as.numeric(
    stringr::str_extract(release_year$release_year, pattern = "\\d+")
  )
  df <- left_join(df, release_year, by = "rank")
} else {
  # The text may carry more than the year (the author mentions a sequel
  # marker), so keep only the first run of digits and store it as a number.
  release_year_data <- as.numeric(
    stringr::str_extract(release_year_data, pattern = "\\d+")
  )
  df <- df %>% mutate(release_year = release_year_data)
}
The gross feature contains some missing values. Here you’ll see how the else block’s logic deals with missing values automatically.
### gross
# Gross earnings are not listed for every film, so this feature normally takes
# the fallback branch below. The column is numeric in both branches.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span") # gross only
gross_data <- html_text(gross_data_html)
if (length(gross_data) == length(rank_data)) {
  # One gross figure per ranked film: attach row by row, keeping only the
  # decimal number from the scraped text.
  gross_data <- as.numeric(stringr::str_extract(gross_data, "\\d+\\.\\d+"))
  df <- mutate(df, gross = gross_data)
} else {
  # Scraping gross on its own only tells us how many values exist (87 when the
  # author fetched the page), not which films they belong to. So scrape rank
  # and gross together:
  #   ".text-primary"              -> rank
  #   ".ghost~ .text-muted+ span"  -> gross
  gross_data_html <- html_nodes(
    webpage, ".text-primary , .ghost~ .text-muted+ span"
  )
  gross_data <- html_text(gross_data_html)
  # The combined vector mixes rank labels and gross values. Everything that is
  # not a rank label is a gross value, and the entry right before it is taken
  # as the rank it belongs to (this relies on each gross value directly
  # following its rank in the combined result).
  gross_idx <- which(!(gross_data %in% rank_data))
  # A value in first position has no preceding rank to pair with.
  gross_idx <- gross_idx[gross_idx > 1]
  # Temporary lookup table: first column the rank, second column its gross.
  gross <- data.frame(
    rank = gross_data[gross_idx - 1],
    gross = as.numeric(
      stringr::str_extract(gross_data[gross_idx], "\\d+\\.\\d+")
    ),
    stringsAsFactors = FALSE
  )
  # Join onto the existing data frame by rank; films without a gross value
  # find no match and get NA.
  df <- left_join(df, gross, by = "rank")
}
Same check was applied on features already fetched so far like genre, running time, etc. And same will be applied on upcoming features like votes, certification, etc.
### votes
vote_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
vote_data <- vote_data_html %>% html_text()
if (length(vote_data) != 100) {
  # Some vote counts are missing: scrape ranks and votes together and pair
  # every vote count with the rank entry just before it.
  vote_data_html <- webpage %>%
    html_nodes(".text-primary , .sort-num_votes-visible span:nth-child(2)")
  vote_data <- vote_data_html %>% html_text()
  vote <- data.frame(rank = NA, vote = NA)
  for (pos in which(!(vote_data %in% rank_data))) {
    vote[pos, "vote"] <- vote_data[pos]
    vote[pos, "rank"] <- vote_data[pos - 1]
  }
  # Remove thousands separators, then store the counts as numbers.
  vote$vote <- as.numeric(gsub(pattern = ",", replacement = "", x = vote$vote))
  df <- left_join(df, vote, by = "rank")
} else {
  # One vote count per film: remove thousands separators and convert.
  vote_data <- as.numeric(gsub(pattern = ",", replacement = "", x = vote_data))
  df <- df %>% mutate(vote = vote_data)
}
### certificate
certificate_data_html <- webpage %>% html_nodes(".certificate")
certificate_data <- certificate_data_html %>% html_text()
if (length(certificate_data) != 100) {
  # Some certificates are missing: scrape ranks and certificates together and
  # pair every certificate with the rank entry just before it.
  certificate_data_html <- webpage %>%
    html_nodes(".text-primary , .certificate")
  certificate_data <- certificate_data_html %>% html_text()
  certificate <- data.frame(rank = NA, certificate = NA)
  for (pos in which(!(certificate_data %in% rank_data))) {
    # paste() with a single argument only coerces the value to character.
    certificate[pos, "rank"] <- paste(certificate_data[pos - 1])
    certificate[pos, "certificate"] <- paste(certificate_data[pos])
  }
  # Ranks without a certificate find no match and end up as NA.
  df <- left_join(df, certificate, by = "rank")
} else {
  # One certificate per film, so they can be attached row by row.
  df <- df %>% mutate(certificate = certificate_data)
}
### meta score
# Metascore of each film, stored as a numeric column in every case.
# NOTE(review): ".favorable" only matches nodes carrying that class; scores
# rendered with a different class would not be captured - confirm against the
# page markup.
meta_score_data_html <- html_nodes(webpage, ".favorable")
meta_score_data <- html_text(meta_score_data_html, trim = TRUE)
if (length(meta_score_data) == length(rank_data)) {
  # One score per ranked film: attach row by row. Convert to numeric here
  # too, so the column type does not depend on which branch ran.
  df <- mutate(df, meta_score = as.numeric(meta_score_data))
} else {
  # Some scores are missing: scrape ranks and scores in one pass. This relies
  # on each score directly following its rank in the combined result.
  meta_score_data_html <- html_nodes(webpage, ".text-primary , .favorable")
  meta_score_data <- html_text(meta_score_data_html, trim = TRUE)
  # Positions of the entries that are scores (i.e. not rank labels).
  meta_idx <- which(!(meta_score_data %in% rank_data))
  # A value in first position has no preceding rank to pair with.
  meta_idx <- meta_idx[meta_idx > 1]
  # If the page holds no score at all, meta_idx is empty and this lookup table
  # has zero rows; the join below then simply adds an all-NA numeric column,
  # so no separate special case is needed.
  meta_score <- data.frame(
    rank = meta_score_data[meta_idx - 1],
    meta_score = as.numeric(meta_score_data[meta_idx]),
    stringsAsFactors = FALSE
  )
  df <- left_join(df, meta_score, by = "rank")
}
###director
# The CSS selector returns directors and actors together in one paragraph, so
# the text is split on ":" and "|" and the second piece is kept as director.
director_data <- (html_nodes(webpage, ".text-muted+ p") %>%
  html_text(trim = TRUE) %>%
  tstrsplit(split = ":|\\|", keep = 2))[[1]] %>%
  trimws()
if (length(director_data) == length(rank_data)) {
  # One entry per ranked film: attach row by row.
  df <- mutate(df, director = director_data)
} else {
  # Fallback: scrape rank labels together with the credit paragraphs so each
  # director can be paired with the rank entry just before it. The rank
  # selector has to be part of the query, otherwise there is nothing to pair
  # the values with.
  director_parts <- html_nodes(webpage, ".text-primary , .text-muted+ p") %>%
    html_text(trim = TRUE) %>%
    tstrsplit(
      split = ":|\\|", keep = c(1, 2), names = c("rank", "director")
    )
  # Rank labels contain no separator, so their second piece is NA; put the
  # label itself there to obtain one vector alternating rank and director.
  director_data <- trimws(ifelse(
    is.na(director_parts$director),
    director_parts$rank,
    director_parts$director
  ))
  # Positions of the entries that are directors (i.e. not rank labels).
  director_idx <- which(!(director_data %in% rank_data))
  # A value in first position has no preceding rank to pair with.
  director_idx <- director_idx[director_idx > 1]
  director <- data.frame(
    rank = director_data[director_idx - 1],
    director = director_data[director_idx],
    stringsAsFactors = FALSE
  )
  # Films without a director entry find no match and get NA.
  df <- left_join(df, director, by = "rank")
}
###actor
# Same credit paragraph as for the director; the fourth piece after splitting
# on ":" and "|" is kept as the cast list.
actor_data <- (html_nodes(webpage, ".text-muted+ p") %>%
  html_text() %>%
  tstrsplit(split = ":|\\|", keep = 4))[[1]] %>%
  trimws()
if (length(actor_data) == length(rank_data)) {
  # Name the column `actor` explicitly so both branches produce the same
  # column; an unnamed mutate() argument would be called `actor_data`.
  df <- mutate(df, actor = actor_data)
} else {
  # Fallback: scrape rank labels together with the credit paragraphs so each
  # cast list can be paired with the rank entry just before it.
  actor_parts <- html_nodes(webpage, ".text-primary , .text-muted+ p") %>%
    html_text(trim = TRUE) %>%
    tstrsplit(split = ":|\\|", keep = c(1, 4), names = c("rank", "actor"))
  # Rank labels have no fourth piece (NA); put the label itself there to
  # obtain one vector alternating rank and cast. trimws() matches the cleanup
  # applied in the branch above.
  actor_data <- trimws(ifelse(
    is.na(actor_parts$actor),
    actor_parts$rank,
    actor_parts$actor
  ))
  # Positions of the entries that are cast lists (i.e. not rank labels).
  actor_idx <- which(!(actor_data %in% rank_data))
  # A value in first position has no preceding rank to pair with.
  actor_idx <- actor_idx[actor_idx > 1]
  actor <- data.frame(
    rank = actor_data[actor_idx - 1],
    actor = actor_data[actor_idx],
    stringsAsFactors = FALSE
  )
  # Films without a cast entry find no match and get NA.
  df <- left_join(df, actor, by = "rank")
}
# Every join above is done, so the first column (rank) can now be turned from
# text into a number; commas are removed before the conversion.
df[[1]] <- as.numeric(gsub(pattern = ",", replacement = "", x = df[[1]]))
# Print the first row of the finished data frame.
df[1, ] %>%
  as.data.frame() %>%
  pandoc.table()
##
## --------------------------------------------------------------------------
## rank title description star
## ------ ---------------------------- ------------------------------- ------
## 1 Captain America: Civil War Political involvement in the 7.8
## Avengers' activities causes a
## rift between Captain America
## and Iron Man.
## --------------------------------------------------------------------------
##
## Table: Table continues below
##
##
## --------------------------------------------------------------------------
## genre running_time release_year gross vote
## --------------------------- -------------- -------------- ------- --------
## Action, Adventure, Sci-Fi 147 2016 408.1 494962
## --------------------------------------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------------
## certificate meta_score director
## ------------- ------------ --------------------------
## UA 75 Anthony Russo, Joe Russo
## -----------------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------
## actor_data
## -----------------------------
## Chris Evans, Robert Downey
## Jr., Scarlett Johansson,
## Sebastian Stan
## -----------------------------