Beginner’s Guide on Web Scraping in R

#Loading the rvest package
library('rvest')
## Warning: package 'rvest' was built under R version 4.3.1
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(stringr)
# Target page: IMDb advanced search listing feature films released in 2016,
# 100 results per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Fetch the page and parse it into an HTML document
webpage <- read_html(x = url)
# Rankings: select the nodes matched by the '.text-primary' CSS selector
rank_data_html <- webpage %>%
  html_nodes(".text-primary")

# Pull the text content out of each matched node
rank_data <- rank_data_html %>%
  html_text()

# Preview the first few scraped rankings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# Data-preprocessing: the scraped rankings are strings such as "1.", so
# coerce them to numbers
rank_data <- rank_data %>%
  as.numeric()

# Preview the converted rankings
head(rank_data)
## [1] 1 2 3 4 5 6
# Titles: select the links inside each '.lister-item-header' node
title_data_html <- webpage %>%
  html_nodes(".lister-item-header a")

# Pull the text content out of each matched node
title_data <- title_data_html %>%
  html_text()

# Preview the first few titles
head(title_data)
## [1] "Arrival"                            "Hacksaw Ridge"                     
## [3] "Terrifier"                          "Suicide Squad"                     
## [5] "Batman v Superman: Dawn of Justice" "Me Before You"
# Descriptions: select the '.text-muted' node that directly follows each
# '.ratings-bar' node
description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")

# Pull the text content out of each matched node
description_data <- description_data_html %>%
  html_text()

# Preview the first few descriptions
head(description_data)
## [1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
## [2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
## [4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
## [5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
## [6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Data-preprocessing: strip the newline characters from every description
description_data <- gsub(pattern = "\n", replacement = "", x = description_data)

# Preview the cleaned descriptions
head(description_data)
## [1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
## [2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
## [5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
## [6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Runtimes: select the '.runtime' nodes nested inside '.text-muted' nodes
runtime_data_html <- webpage %>%
  html_nodes(".text-muted .runtime")

# Pull the text content out of each matched node
runtime_data <- runtime_data_html %>%
  html_text()

# Preview the first few runtimes
head(runtime_data)
## [1] "116 min" "139 min" "85 min"  "123 min" "151 min" "106 min"
# Data-preprocessing: drop the " min" suffix, then coerce the remainder to
# numeric
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

# Preview the numeric runtimes
head(runtime_data)
## [1] 116 139  85 123 151 106
# Genres: select the nodes matched by the '.genre' CSS selector
genre_data_html <- webpage %>%
  html_nodes(".genre")

# Pull the text content out of each matched node
genre_data <- genre_data_html %>%
  html_text()

# Preview the first few genre strings
head(genre_data)
## [1] "\nDrama, Mystery, Sci-Fi            "    
## [2] "\nBiography, Drama, History            " 
## [3] "\nHorror, Thriller            "          
## [4] "\nAction, Adventure, Fantasy            "
## [5] "\nAction, Adventure, Sci-Fi            " 
## [6] "\nDrama, Romance            "
# Data-preprocessing: delete every newline and every space in a single pass
genre_data <- gsub("[\n ]", "", genre_data)

# Keep only the first listed genre by cutting everything from the first comma
genre_data <- gsub(",.*", "", genre_data)

# Convert the genre from text to a factor
genre_data <- as.factor(genre_data)

# Preview the cleaned genre factor
head(genre_data)
## [1] Drama     Biography Horror    Action    Action    Drama    
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# IMDb ratings: select the <strong> nodes inside '.ratings-imdb-rating'
rating_data_html <- webpage %>%
  html_nodes(".ratings-imdb-rating strong")

# Pull the text content out of each matched node
rating_data <- rating_data_html %>%
  html_text()

# Preview the first few ratings
head(rating_data)
## [1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"
# Data-preprocessing: coerce the rating strings to numbers
rating_data <- rating_data %>%
  as.numeric()

# Preview the numeric ratings
head(rating_data)
## [1] 7.9 8.1 5.6 5.9 6.4 7.4
# Votes: select the second child span of each '.sort-num_votes-visible' node
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Pull the text content out of each matched node
votes_data <- votes_data_html %>%
  html_text()

# Preview the first few vote counts
head(votes_data)
## [1] "722,941" "553,449" "43,208"  "701,360" "714,498" "268,270"
# Data-preprocessing: remove the thousands separators, then coerce the vote
# counts to numeric
votes_data <- as.numeric(gsub(",", "", votes_data))

# Preview the numeric vote counts
head(votes_data)
## [1] 722941 553449  43208 701360 714498 268270
# Directors: select the first-child link of the <p> that directly follows a
# '.text-muted' node
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")

# Pull the text content out of each matched node
directors_data <- directors_data_html %>%
  html_text()

# Preview the first few director names
head(directors_data)
## [1] "Denis Villeneuve" "Mel Gibson"       "Damien Leone"     "David Ayer"      
## [5] "Zack Snyder"      "Thea Sharrock"
# Data-preprocessing: store the director names as a factor
directors_data <- directors_data %>%
  as.factor()

# Actors: select the link that directly follows a '.ghost' node inside each
# '.lister-item-content' node
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")

# Pull the text content out of each matched node
actors_data <- actors_data_html %>%
  html_text()

# Data-preprocessing: store the actor names as a factor
actors_data <- actors_data %>%
  as.factor()

# Preview the actor factor
head(actors_data)
## [1] Amy Adams       Andrew Garfield Jenna Kanell    Will Smith     
## [5] Ben Affleck     Emilia Clarke  
## 90 Levels: Aamir Khan Adam Driver Alexander Skarsgård ... Zoey Deutch
# Metascores: select the nodes matched by the '.metascore' CSS selector
metascore_data_html <- webpage %>%
  html_nodes(".metascore")

# Pull the text content out of each matched node
metascore_data <- metascore_data_html %>%
  html_text()

# Preview the first few Metascore strings
head(metascore_data)
## [1] "81        " "71        " "40        " "44        " "51        "
## [6] "65        "
# Data-preprocessing: strip the padding spaces from the Metascore strings
metascore_data <- gsub(pattern = " ", replacement = "", x = metascore_data)

# Count how many Metascore values were scraped
length(metascore_data)
## [1] 95
# Scrape every '.ratings-bar' node and convert each one to text
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))
# Preview the ratings bar text
head(ratings_bar_data)
## [1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
## [2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
## [3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
## [6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"
# Extract the Metascore from each ratings bar. The capture group accepts any
# number of digits, so a one-digit score or a score of 100 is kept intact
# (a fixed two-digit pattern would read "100 Metascore" as 0 and miss
# one-digit scores). Bars without a "Metascore" label produce NA, which keeps
# the result aligned one-to-one with the ratings bars.
metascore_data <- str_match(ratings_bar_data, "(\\d+) Metascore")[, 2] %>%
  as.numeric() # convert to number
length(metascore_data)
## [1] 100
# Print the full Metascore vector
metascore_data
##   [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
##  [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
##  [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
##  [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA
# Summary statistics (including the NA count) for the Metascore values
summary(metascore_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   61.00   60.05   72.50   99.00       5
# Gross revenue: select the span that directly follows a '.text-muted' node
# which itself comes after a '.ghost' sibling
gross_data_html <- webpage %>%
  html_nodes(".ghost~ .text-muted+ span")

# Pull the text content out of each matched node
gross_data <- gross_data_html %>%
  html_text()

# Preview the first few gross revenue strings
head(gross_data)
## [1] "$100.55M" "$67.21M"  "$325.10M" "$330.36M" "$56.25M"  "$363.07M"
# Data-preprocessing: strip the "$" prefix and the "M" suffix. Removing the
# symbols by pattern keeps every digit; slicing a fixed character range
# instead would cut a value such as "$330.36" down to "330.3".
gross_data <- gsub("[$M]", "", gross_data)

# Check how many gross values were scraped
length(gross_data)
## [1] 91
# Filling missing entries with NA.
# Inserting NA at hard-coded positions only works for the page snapshot the
# positions were taken from; on any other snapshot the vector ends up with the
# wrong length and the NAs land on the wrong movies. Instead, read the votes
# bar of each movie and extract the dollar figure from it: a bar without a
# "Gross" field simply yields NA, so the result stays aligned with the
# other columns.
# NOTE(review): assumes one '.sort-num_votes-visible' node per movie, as the
# votes-bar extraction later in this file also does - confirm.
gross_data <- html_nodes(webpage, ".sort-num_votes-visible") %>%
  html_text2() %>%
  str_match("\\$([0-9.]+)M")

# Data-Preprocessing: keep the captured number and convert gross to numerical
gross_data <- as.numeric(gross_data[, 2])
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
# Re-check the length of the gross data after NA filling and conversion
length(gross_data)
## [1] 105
# Summary statistics (including the NA count) for the gross values
summary(gross_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.01   24.23   56.25   99.25  125.80  532.10      14
# Scrape each '.sort-num_votes-visible' node (the votes bar) and convert it
# to text
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))
# Preview the votes bar text
head(votes_bar_data)
## [1] "Votes: 722,941 | Gross: $100.55M" "Votes: 553,449 | Gross: $67.21M" 
## [3] "Votes: 43,208"                    "Votes: 701,360 | Gross: $325.10M"
## [5] "Votes: 714,498 | Gross: $330.36M" "Votes: 268,270 | Gross: $56.25M"
# Extract the gross earnings (in millions) from each votes bar, e.g.
# "Votes: 722,941 | Gross: $100.55M" -> 100.55. The capture group keeps every
# digit of the figure (a fixed-width substring would truncate "330.36" to
# "330.3"); bars without a "Gross" field yield NA.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)M")[, 2] %>%
  as.numeric()
length(gross_data)
## [1] 100
# Assemble the scraped vectors into one data frame, one column per attribute
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the structure of the resulting data frame
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
##  $ Description         : chr  "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
##  $ Runtime             : num  116 139 85 123 151 106 108 111 128 107 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
##  $ Rating              : num  7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
##  $ Metascore           : num  81 71 NA 40 44 51 65 26 94 81 ...
##  $ Votes               : num  722941 553449 43208 701360 714498 ...
##  $ Gross_Earning_in_Mil: num  100.5 67.2 NA 325.1 330.3 ...
##  $ Director            : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
##  $ Actor               : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...

Question 1

Based on the above data, which movie from which Genre had the longest runtime?

# Histogram of runtimes in 10-minute bins with one fill colour per genre; the
# genres are drawn at the same position rather than stacked
ggplot(data = movies_df, mapping = aes(x = Runtime, fill = Genre)) +
  geom_histogram(position = "identity", binwidth = 10) +
  ggtitle("Distribution of Movie Runtimes by Genre") +
  labs(x = "Runtime", y = "Count", fill = "Genre")

## Removing all data that doesn’t help me answer the question. I look only at Action movies because, in the graph above, that genre clearly has the longest runtime.

# Keep only the rows whose genre is "Action"
action_movies <- movies_df[movies_df[["Genre"]] == "Action", ]

# Histogram of runtimes (10-minute bins) for the Action subset
p1 <- ggplot(data = action_movies, mapping = aes(x = Runtime, fill = Genre)) +
  geom_histogram(position = "identity", binwidth = 10) +
  ggtitle("Distribution of Movie Runtimes for Action Genre") +
  labs(x = "Runtime", y = "Count")
p1

## Answer 1 Based on the graph below, Batman v Superman: Dawn of Justice (Ultimate Edition) from the action genre has the longest runtime.

# Horizontal bar chart: one bar per Action title, bar length equal to runtime
p2 <- ggplot(data = action_movies, mapping = aes(x = Title, y = Runtime)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Runtime of Action Genre Movies") +
  labs(x = "Title", y = "Runtime")
p2

## Question 2 Based on the above data, among movies with a runtime of 130-160 minutes, which genre has the most votes?

# Scatter plot of rating against runtime; point size encodes the vote count
# and point colour encodes the genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

## Removing data that doesn’t help me answer the question. I filter by Runtime so that only movies between 130 and 160 minutes remain, then use plotly to make the graph interactive so I can see the values as I hover over the points.

# Keep only movies whose runtime lies between 130 and 160 minutes (inclusive)
filtered_movies <- movies_df[
  movies_df[["Runtime"]] >= 130 & movies_df[["Runtime"]] <= 160,
]

# Scatter plot of rating against runtime for the subset; size = votes,
# colour = genre
p3 <- ggplot(data = filtered_movies, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre)) +
  ggtitle("Runtime vs Rating by Genre") +
  labs(x = "Runtime (minutes)", y = "Rating")
p3

## Answer 2 The Action genre clearly has the most votes in this range of runtime.

# Convert the static ggplot p3 into an interactive plotly object
p4 <- ggplotly(p3)
# Display the interactive plot
p4

Question 3

Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?

# Scatter plot of gross earnings against runtime; point size encodes the
# rating and point colour encodes the genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Rating, colour = Genre))
## Warning: Removed 9 rows containing missing values (`geom_point()`).

## Filter the data. I first filter by runtime so that I only deal with the 100-120 minute range.

# Keep only movies whose runtime lies between 100 and 120 minutes (inclusive)
filtered_movies1 <- movies_df[
  movies_df[["Runtime"]] >= 100 & movies_df[["Runtime"]] <= 120,
]
# Scatter plot of gross earnings against runtime for the subset; size =
# rating, colour = genre
ggplot(
  data = filtered_movies1,
  mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)
) +
  geom_point(mapping = aes(size = Rating, colour = Genre)) +
  ggtitle("Runtime vs Gross Earnings by Genre") +
  labs(x = "Runtime (minutes)", y = "Gross Earnings (Millions)")
## Warning: Removed 1 rows containing missing values (`geom_point()`).

## Calculate the mean for each genre, then use data.frame to build a new dataset so it is easier to plot.

# Mean gross earnings per genre within the 100-120 minute subset.
# na.rm = TRUE skips movies that have no gross figure; without it a single
# missing value turns the whole genre's mean into NA and that genre drops out
# of the bar chart.
mean_gross_by_genre <- tapply(
  filtered_movies1$Gross_Earning_in_Mil,
  filtered_movies1$Genre,
  mean,
  na.rm = TRUE
)

# Reshape the named result into a two-column data frame for plotting
mean_gross_df <- data.frame(Genre = names(mean_gross_by_genre), Mean_Gross = mean_gross_by_genre)

Answer 3

I use the new dataset to make a graph and see that Animation had the highest average earnings for the chosen runtime.

# Horizontal bar chart of the mean gross earnings, one bar per genre
p6 <- ggplot(data = mean_gross_df, mapping = aes(x = Genre, y = Mean_Gross)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Mean Gross Earnings by Genre") +
  labs(x = "Genre", y = "Mean Gross Earnings (Millions)")
p6
## Warning: Removed 1 rows containing missing values (`position_stack()`).