Beginner’s Guide on Web Scraping HW

Author

Daniel B

Web Scraping in R (using rvest) Tutorial

Loading Package and Specifying URL

# Loading the rvest package
library('rvest')

# IMDb search URL: up to 100 results, release date 2016, title type "feature"
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Fetch the page and parse its HTML; every selector below queries `webpage`
webpage <- url |> read_html()

Step 3: Scraping and Displaying Rankings Using CSS Selectors

# Select every node that matches the rankings selector
rank_data_html <- webpage |> html_nodes(".text-primary")

# Pull the text out of the selected nodes
rank_data <- rank_data_html |> html_text()

# Preview the first few scraped rankings
head(rank_data)

[1] "1." "2." "3." "4." "5." "6."

Step 4: Data-Preprocessing: Converting Rankings to Numerical Data and Reviewing Rankings

# Data-Preprocessing: parse the ranking strings (e.g. "1.") as numbers
rank_data <- rank_data |> as.numeric()

# Preview the converted rankings
head(rank_data)

[1] 1 2 3 4 5 6

Step 6: Scraping and Displaying Title Data Using CSS Selectors

# Select the link inside each list item header (the movie title)
title_data_html <- webpage |> html_nodes(".lister-item-header a")

# Pull the text out of the selected nodes
title_data <- title_data_html |> html_text()

# Preview the first few titles
head(title_data)

[1] "Terrifier"       "Suicide Squad"   "Silence"         "Hush"           
[5] "The Conjuring 2" "Split"

Step 7: Scraping The Rest of the Data

Scraping and Displaying Description Data Using CSS Selectors

# Select the muted-text element that directly follows each ratings bar
description_data_html <- webpage |> html_nodes(".ratings-bar+ .text-muted")

# Pull the text out of the selected nodes
description_data <- description_data_html |> html_text()

# Preview the raw descriptions (they still carry a leading "\n")
head(description_data)

[1] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                       
[2] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
[3] "\nIn the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism."  
[4] "\nA deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."                         
[5] "\nEd and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."                                     
[6] "\nThree girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."

Data-Preprocessing: Cleaning Description Data and Reviewing It

# Data-Preprocessing: delete every newline character from the descriptions
description_data <- gsub(pattern = "\n", replacement = "", x = description_data)

# Preview the cleaned descriptions
head(description_data)

[1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                       
[2] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
[3] "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism."  
[4] "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."                         
[5] "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."                                     
[6] "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."

Scraping and Displaying Movie Runtime Using CSS Selectors

# Select the runtime element of each movie
runtime_data_html <- webpage |> html_nodes(".text-muted .runtime")

# Pull the text out of the selected nodes
runtime_data <- runtime_data_html |> html_text()

# Preview the raw runtimes (strings such as "85 min")
head(runtime_data)

[1] "85 min"  "123 min" "161 min" "82 min"  "134 min" "117 min"

Data-Preprocessing: Removing the “ min” Suffix and Converting Runtime to Numerical Data

# Data-Preprocessing: drop the " min" suffix, then parse what is left as numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

# Preview the numeric runtimes
head(runtime_data)

[1]  85 123 161  82 134 117

Scraping and Displaying Movie Genre Using CSS Selectors

# Select the genre element of each movie
genre_data_html <- webpage |> html_nodes(".genre")

# Pull the text out of the selected nodes
genre_data <- genre_data_html |> html_text()

# Preview the raw genre strings (leading "\n", trailing spaces, comma-separated)
head(genre_data)

[1] "\nHorror, Thriller            "          
[2] "\nAction, Adventure, Fantasy            "
[3] "\nDrama, History            "            
[4] "\nHorror, Thriller            "          
[5] "\nHorror, Mystery, Thriller            " 
[6] "\nHorror, Thriller            "

Data-Preprocessing: Cleaning, Extracting First Genre, and Converting to Factors

# Data-Preprocessing, in order:
#   1. delete newline characters
#   2. delete every space
#   3. cut everything from the first comma on, keeping only the first genre
#   4. store the result as a factor
genre_data <- genre_data |>
  gsub(pattern = "\n", replacement = "") |>
  gsub(pattern = " ", replacement = "") |>
  gsub(pattern = ",.*", replacement = "") |>
  as.factor()

# Preview the cleaned genre factor
head(genre_data)

[1] Horror Action Drama  Horror Horror Horror
9 Levels: Action Adventure Animation Biography Comedy Crime Drama ... Horror

Scraping and Displaying IMDB Ratings Using CSS Selectors

# Select the bold rating value inside each IMDb rating widget
rating_data_html <- webpage |> html_nodes(".ratings-imdb-rating strong")

# Pull the text out of the selected nodes
rating_data <- rating_data_html |> html_text()

# Preview the raw ratings (still character strings)
head(rating_data)

[1] "5.6" "5.9" "7.2" "6.6" "7.3" "7.3"

Data-Preprocessing: Converting Ratings to Numerical Values and Reviewing the Data

# Data-Preprocessing: parse the rating strings as numbers
rating_data <- rating_data |> as.numeric()

# Preview the numeric ratings
head(rating_data)

[1] 5.6 5.9 7.2 6.6 7.3 7.3

Scraping and Displaying Votes Data Using CSS Selectors

# Select the second span inside each votes row
votes_data_html <- webpage |>
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Pull the text out of the selected nodes
votes_data <- votes_data_html |> html_text()

# Preview the raw vote counts (strings with thousands separators)
head(votes_data)

[1] "47,700"  "710,221" "119,465" "149,262" "292,293" "532,895"

Data-Preprocessing: Removing Commas and Converting Votes to Numerical Values, and Reviewing the Data

# Data-Preprocessing: strip the thousands separators, then parse as numbers
votes_data <- as.numeric(gsub(",", "", votes_data))

# Preview the numeric vote counts
head(votes_data)

[1]  47700 710221 119465 149262 292293 532895

Scraping and Displaying Directors Data Using CSS Selectors

# Select the first link in the paragraph that follows the muted-text element
directors_data_html <- webpage |> html_nodes(".text-muted+ p a:nth-child(1)")

# Pull the text out of the selected nodes
directors_data <- directors_data_html |> html_text()

# Preview the first few director names
head(directors_data)

[1] "Damien Leone"       "David Ayer"         "Martin Scorsese"   
[4] "Mike Flanagan"      "James Wan"          "M. Night Shyamalan"

Data-Preprocessing: Converting Directors Data to Factors and Scraping Actors Data Using CSS Selectors

# Data-Preprocessing: store the director names as a factor
directors_data <- directors_data |> as.factor()

# Blank out entry 40, which the author identified as a missing director.
# NOTE(review): the position is hard-coded for the page as scraped; re-check it
# whenever the page is scraped again.
directors_data[40L] <- NA

# Select the link that directly follows each ".ghost" separator
actors_data_html <- webpage |> html_nodes(".lister-item-content .ghost+ a")

# Pull the text out of the selected nodes
actors_data <- actors_data_html |> html_text()

# Preview the first few actor names
head(actors_data)

[1] "Jenna Kanell"       "Will Smith"         "Andrew Garfield"   
[4] "John Gallagher Jr." "Vera Farmiga"       "James McAvoy"

Data-Preprocessing: Converting actors data into factors

# Data-Preprocessing: store the actor names as a factor
actors_data <- actors_data |> as.factor()

Scraping and Displaying Metascore Data Using CSS Selectors

# Select the metascore element of each movie that has one
metascore_data_html <- webpage |> html_nodes(".metascore")

# Pull the text out of the selected nodes
metascore_data <- metascore_data_html |> html_text()

# Preview the raw metascores (strings padded with trailing spaces)
head(metascore_data)

[1] "40        " "79        " "67        " "65        " "63        "
[6] "71        "

Data-Preprocessing: Removing Extra Space in Metascore and Checking Data Length

# Data-Preprocessing: delete every space from the metascore strings
metascore_data <- gsub(pattern = " ", replacement = "", x = metascore_data)

# Count the scraped metascores; fewer than 100 means some movies have none
length(metascore_data)

[1] 95

Step 9: Padding the Metascore Data to 100 Entries by Inserting NA for Movies Without a Score (the Missing Director Was Already Set to NA Above)

# The first movie in the list has no metascore, so pad the front with NA.
# NOTE(review): this and the positions below are hard-coded for the page as
# scraped; re-derive them if the page is scraped again.
metascore_data <- c(NA, metascore_data)

# Insert NA at each remaining position that has no metascore. Positions are
# processed in ascending order, so each one already refers to the final
# 100-element layout. `append(..., after = i - 1)` keeps the data an atomic
# vector (appending `list(NA)` silently turned it into a list) and also behaves
# correctly when the insertion point is past the current end.
for (i in c(40, 44, 65, 97)) {
  metascore_data <- append(metascore_data, NA, after = i - 1)
}

# Data-Preprocessing: converting metascore to numerical
# (`as.numeric()` has no `na.rm` argument; NA entries simply stay NA.)
metascore_data <- as.numeric(metascore_data)

# Let's have another look at length of the metascore data
length(metascore_data)

[1] 100

Step 10: Scraping Gross earnings

# Select the span that follows the muted label after the ".ghost" separator
gross_data_html <- webpage |> html_nodes(".ghost~ .text-muted+ span")

# Pull the text out of the selected nodes
gross_data <- gross_data_html |> html_text()

# Preview the raw gross figures (strings such as "$325.10M")
head(gross_data)

[1] "$325.10M" "$7.10M"   "$102.47M" "$138.29M" "$67.21M"  "$2.01M"

# Data-Preprocessing: removing '$' and 'M' signs.
# Strip the characters by pattern rather than by fixed position: the previous
# `substring(x, 2, 6)` kept at most five characters, so a value such as
# "$102.47M" was silently truncated to "102.4".
gross_data <- gsub("[$M]", "", gross_data)

# Let's check the length of gross data
length(gross_data)

[1] 90

# The first movie in the list has no gross figure, so pad the front with NA.
# NOTE(review): this and the positions below are hard-coded for the page as
# scraped; re-derive them if the page is scraped again.
gross_data <- c(NA, gross_data)

# Filling missing entries with NA. Positions are processed in ascending order,
# so each one already refers to the final 100-element layout.
# `append(..., after = i - 1)` keeps the data an atomic vector (appending
# `list(NA)` silently turned it into a list) and also behaves correctly when
# the insertion point is past the current end.
for (i in c(4, 32, 35, 40, 44, 48, 65, 70, 97)) {
  gross_data <- append(gross_data, NA, after = i - 1)
}

# Data-Preprocessing: converting gross to numerical
# (`as.numeric()` has no `na.rm` argument; NA entries simply stay NA.)
gross_data <- as.numeric(gross_data)

# Let's have another look at the length of gross data
length(gross_data)

[1] 100

# Quartiles, mean and NA count of the gross earnings
gross_data |> summary()

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.01   15.50   55.87   93.09  122.05  532.10      10

Step 11: Combining Features into a Dataframe

# Assemble the scraped vectors into one data frame, one row per movie
movies_df <- data.frame(
  Rank                 = rank_data,
  Title                = title_data,
  Description          = description_data,
  Runtime              = runtime_data,
  Genre                = genre_data,
  Rating               = rating_data,
  Metascore            = metascore_data,
  Votes                = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director             = directors_data
)

# Inspect the column types and the first few values of each column
movies_df |> str()

'data.frame':   100 obs. of  10 variables:
 $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Title               : chr  "Terrifier" "Suicide Squad" "Silence" "Hush" ...
 $ Description         : chr  "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is"| __truncated__ "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence "| __truncated__ ...
 $ Runtime             : num  85 123 161 82 134 117 139 145 128 108 ...
 $ Genre               : Factor w/ 9 levels "Action","Adventure",..: 9 1 7 9 9 9 4 7 5 3 ...
 $ Rating              : num  5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
 $ Metascore           : num  NA 40 79 67 65 63 71 85 94 59 ...
 $ Votes               : num  47700 710221 119465 149262 292293 ...
 $ Gross_Earning_in_Mil: num  NA 325.1 7.1 NA 102.4 ...
 $ Director            : Factor w/ 97 levels "Alessandro Carloni",..: 19 22 61 65 44 59 63 71 18 34 ...

Visualizations

Graph 1

library(ggplot2)

# Histogram of runtimes (30 bins), with bars filled by genre.
# Written with ggplot() + geom_histogram() because qplot() is deprecated as of
# ggplot2 3.4.0; this is the same plot qplot() produced for a single x variable.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

Graph 2

# Scatter plot of rating against runtime; point size shows the vote count and
# point colour shows the genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

Graph 3

# Scatter plot of gross earnings against runtime; point size shows the rating
# and point colour shows the genre
ggplot(
  data = movies_df,
  mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)
) +
  geom_point(mapping = aes(size = Rating, colour = Genre))

Questions

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

The movie is titled Silence. The genre is Drama.

library(dplyr)

# Longest runtime in the data set, ignoring missing values
max_runtime <- movies_df |>
  pull(Runtime) |>
  max(na.rm = TRUE)

# Every row whose runtime equals that maximum
max_runtime_row <- filter(movies_df, !is.na(Runtime), Runtime == max_runtime)

# Keep only the columns needed to answer the question
max_runtime_movie <- select(max_runtime_row, Title, Genre)

print(max_runtime_movie)

    Title Genre
1 Silence Drama

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

Among movies with a runtime of 130–160 minutes, the Action genre has the highest total number of votes.

# Total votes per genre among movies running 130 to 160 minutes (inclusive),
# sorted so the genre with the most votes comes first
movies_q2 <- movies_df |>
  filter(Runtime >= 130, Runtime <= 160) |>
  group_by(Genre) |>
  summarise(total_votes = sum(Votes, na.rm = TRUE)) |>
  arrange(desc(total_votes))

print(movies_q2)

# A tibble: 5 × 2
  Genre     total_votes
  <fct>           <dbl>
1 Action        3134975
2 Drama          765503
3 Adventure      593808
4 Biography      571863
5 Horror         292293

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?

Among movies with a runtime of 100–120 minutes, the Animation genre has the highest average gross earnings.

# Mean and median gross earnings per genre among movies running 100 to 120
# minutes (inclusive), sorted so the highest average comes first. Movies with
# no gross figure are left out of both statistics.
movies_q3 <- movies_df |>
  filter(Runtime >= 100, Runtime <= 120) |>
  group_by(Genre) |>
  summarise(
    average_gross = mean(Gross_Earning_in_Mil, na.rm = TRUE),
    median_gross = median(Gross_Earning_in_Mil, na.rm = TRUE)
  ) |>
  arrange(desc(average_gross))

print(movies_q3)

# A tibble: 8 × 3
  Genre     average_gross median_gross
  <fct>             <dbl>        <dbl>
1 Animation         216.        260.  
2 Adventure         125.         66.3 
3 Action             89.2        58.7 
4 Crime              51.2        51.2 
5 Drama              48.4        42.0 
6 Horror             46.8         1.33
7 Comedy             33.9        14.4 
8 Biography          28.7        27.9