Loading the rvest package

library('rvest')

Loading the tidyverse package

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Loading the plotly package

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Specifying the url for desired website to be scraped

# IMDb title search; the query string asks for count=100 results of
# title_type=feature with release_date limited to 2016
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

Reading the HTML code from the website

# webpage <- read_html(url)
# Download the page with a bounded number of retries.
# `success` records whether `webpage` was populated; `count` tracks attempts.
success <- FALSE
max_retries <- 5       # maximum number of download attempts
retry_delay_secs <- 5  # pause between failed attempts (adjust as needed)
count <- 0

# `&&` is the scalar, short-circuiting operator intended for loop conditions
while (!success && count < max_retries) {
  count <- count + 1
  tryCatch({
    # Attempt to read HTML; `success` is only set when read_html() did not error
    webpage <- read_html(url)
    success <- TRUE
  },
  error = function(e) {
    message(paste("Attempt", count, "failed with message:", conditionMessage(e)))
    # Waiting after the final attempt would only delay the failure report
    if (count < max_retries) {
      Sys.sleep(retry_delay_secs)
    }
  })
}

if (success) {
  message("HTML content successfully retrieved!")
} else {
  # Every later step needs `webpage`, so halt here with a clear error rather
  # than failing further down with "object 'webpage' not found"
  stop("Failed to retrieve HTML content after multiple attempts.", call. = FALSE)
}
## HTML content successfully retrieved!

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

Use the `length()` function to confirm that each vector contains 100 elements, with NAs standing in for any missing data so that every vector still has 100 entries.

# Select the ranking nodes via the .text-primary CSS class
rank_data_html <- webpage |> html_nodes(".text-primary")

# Pull the node text and coerce it straight to numeric
rank_data <- rank_data_html |>
  html_text() |>
  as.numeric()

#Let's have a look at the rankings
head(rank_data)
## [1] 1 2 3 4 5 6
length(rank_data)
## [1] 100

Scrape for Title Information

# Select the links inside each list item's header
title_data_html <- webpage |> html_nodes(".lister-item-header a")

# Extract the link text as a character vector of titles
title_data <- title_data_html |> html_text()

#Let's have a look at the title
head(title_data)
## [1] "Terrifier"       "Suicide Squad"   "Silence"         "Hush"           
## [5] "The Conjuring 2" "Split"
title_data
##   [1] "Terrifier"                                  
##   [2] "Suicide Squad"                              
##   [3] "Silence"                                    
##   [4] "Hush"                                       
##   [5] "The Conjuring 2"                            
##   [6] "Split"                                      
##   [7] "Hacksaw Ridge"                              
##   [8] "The Handmaiden"                             
##   [9] "La La Land"                                 
##  [10] "Sing"                                       
##  [11] "Arrival"                                    
##  [12] "Moana"                                      
##  [13] "Trolls"                                     
##  [14] "Deadpool"                                   
##  [15] "Train to Busan"                             
##  [16] "The Nice Guys"                              
##  [17] "Rogue One: A Star Wars Story"               
##  [18] "Hell or High Water"                         
##  [19] "Miss Peregrine's Home for Peculiar Children"
##  [20] "The Autopsy of Jane Doe"                    
##  [21] "Zootopia"                                   
##  [22] "Your Name."                                 
##  [23] "The Magnificent Seven"                      
##  [24] "Nocturnal Animals"                          
##  [25] "Ghostbusters: Answer the Call"              
##  [26] "The Legend of Tarzan"                       
##  [27] "Me Before You"                              
##  [28] "Passengers"                                 
##  [29] "Ouija: Origin of Evil"                      
##  [30] "Hidden Figures"                             
##  [31] "Don't Breathe"                              
##  [32] "The Wailing"                                
##  [33] "Fantastic Beasts and Where to Find Them"    
##  [34] "Manchester by the Sea"                      
##  [35] "Before I Wake"                              
##  [36] "Lights Out"                                 
##  [37] "10 Cloverfield Lane"                        
##  [38] "Jack Reacher: Never Go Back"                
##  [39] "Sausage Party"                              
##  [40] "Five Nights at Freddy's 4"                  
##  [41] "Certain Women"                              
##  [42] "13 Hours"                                   
##  [43] "The Love Witch"                             
##  [44] "Love Machine"                               
##  [45] "Batman v Superman: Dawn of Justice"         
##  [46] "The Accountant"                             
##  [47] "Captain America: Civil War"                 
##  [48] "Below Her Mouth"                            
##  [49] "Independence Day: Resurgence"               
##  [50] "Gods of Egypt"                              
##  [51] "The Founder"                                
##  [52] "X-Men: Apocalypse"                          
##  [53] "The Neon Demon"                             
##  [54] "Moonlight"                                  
##  [55] "Doctor Strange"                             
##  [56] "Jason Bourne"                               
##  [57] "The Jungle Book"                            
##  [58] "War Dogs"                                   
##  [59] "Raw"                                        
##  [60] "My Big Fat Greek Wedding 2"                 
##  [61] "The Girl on the Train"                      
##  [62] "The Brothers Grimsby"                       
##  [63] "Captain Fantastic"                          
##  [64] "Warcraft"                                   
##  [65] "The Invisible Guest"                        
##  [66] "Pride and Prejudice and Zombies"            
##  [67] "The Belko Experiment"                       
##  [68] "A Cure for Wellness"                        
##  [69] "The BFG"                                    
##  [70] "The Whole Truth"                            
##  [71] "Free State of Jones"                        
##  [72] "Hunt for the Wilderpeople"                  
##  [73] "Lion"                                       
##  [74] "Gold"                                       
##  [75] "The Edge of Seventeen"                      
##  [76] "The Bad Batch"                              
##  [77] "Nerve"                                      
##  [78] "31"                                         
##  [79] "The 5th Wave"                               
##  [80] "Deepwater Horizon"                          
##  [81] "The Lost City of Z"                         
##  [82] "The Great Wall"                             
##  [83] "The Boy"                                    
##  [84] "Assassin's Creed"                           
##  [85] "Now You See Me 2"                           
##  [86] "Alice Through the Looking Glass"            
##  [87] "The Secret Life of Pets"                    
##  [88] "Bad Moms"                                   
##  [89] "Allied"                                     
##  [90] "Dirty Grandpa"                              
##  [91] "Mechanic: Resurrection"                     
##  [92] "Kung Fu Panda 3"                            
##  [93] "The Choice"                                 
##  [94] "The Shallows"                               
##  [95] "The Purge: Election Year"                   
##  [96] "The Void"                                   
##  [97] "The Midnight Man"                           
##  [98] "Star Trek Beyond"                           
##  [99] "Fences"                                     
## [100] "Sully"
length(title_data)
## [1] 100

Scrape for Movie Description Information

# Select the .text-muted element that directly follows each ratings bar
description_data_html <- webpage |> html_nodes(".ratings-bar+ .text-muted")

# Extract the description text
description_data <- description_data_html |> html_text()

#Let's have a look at the description data
head(description_data)
## [1] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                       
## [2] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] "\nIn the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism."  
## [4] "\nA deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."                         
## [5] "\nEd and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."                                     
## [6] "\nThree girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."
# Data-Preprocessing: strip every newline character from the descriptions
description_data <- gsub("\n", "", description_data, fixed = TRUE)

#Let's have another look at the description data 
head(description_data)
## [1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                       
## [2] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism."  
## [4] "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."                         
## [5] "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."                                     
## [6] "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."
length(description_data)
## [1] 100

Scrape for Movie Run Times

# Select the .runtime nodes nested under .text-muted
runtime_data_html <- webpage |> html_nodes(".text-muted .runtime")

# Extract the raw runtime strings
runtime_data <- runtime_data_html |> html_text()

# Peek at the raw strings before cleaning
head(runtime_data)
## [1] "85 min"  "123 min" "161 min" "82 min"  "134 min" "117 min"

# Data-Preprocessing: drop the " min" suffix, then convert to numeric
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))

#Let's have another look at the runtime data
head(runtime_data)
## [1]  85 123 161  82 134 117
length(runtime_data)
## [1] 100

Scrape for Movie Genre Information

# Select the genre nodes via the .genre CSS class
genre_data_html <- webpage |> html_nodes(".genre")

# Extract the raw genre strings
genre_data <- genre_data_html |> html_text()

#Let's have a look at the genre data
head(genre_data)
## [1] "\nHorror, Thriller            "          
## [2] "\nAction, Adventure, Fantasy            "
## [3] "\nDrama, History            "            
## [4] "\nHorror, Thriller            "          
## [5] "\nHorror, Mystery, Thriller            " 
## [6] "\nHorror, Thriller            "
# Data-Preprocessing: remove newlines and all spaces in a single pass
genre_data <- gsub("[\n ]", "", genre_data)

# Keep only the first genre: drop everything from the first comma onward
genre_data <- sub(",.*", "", genre_data)

# Converting each genre from text to factor
genre_data <- factor(genre_data)

#Let's have another look at the genre data
head(genre_data)
## [1] Horror Action Drama  Horror Horror Horror
## 9 Levels: Action Adventure Animation Biography Comedy Crime Drama ... Horror
length(genre_data)
## [1] 100

Scrape for Movie Rating Information

This information changes as the webpage updates regularly

# Select the <strong> rating value inside each .ratings-imdb-rating element
rating_data_html <- webpage |> html_nodes(".ratings-imdb-rating strong")

# Extract the rating strings
rating_data <- rating_data_html |> html_text()

# Peek at the raw strings
head(rating_data)
## [1] "5.6" "5.9" "7.2" "6.6" "7.3" "7.3"

# Data-Preprocessing: convert the rating strings to numeric
rating_data <- as.numeric(rating_data)

#Let's have another look at the ratings data
head(rating_data)
## [1] 5.6 5.9 7.2 6.6 7.3 7.3
length(rating_data)
## [1] 100

Scrape for Voting Information

# Select the second <span> child inside each votes bar
votes_data_html <- webpage |>
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Extract the vote counts as text
votes_data <- votes_data_html |> html_text()

# Peek at the raw strings
head(votes_data)
## [1] "47,734"  "710,230" "119,468" "149,269" "292,311" "532,913"

# Data-Preprocessing: remove the thousands separators, then convert to numeric
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))

#Let's have another look at the votes data
head(votes_data)
## [1]  47734 710230 119468 149269 292311 532913
length(votes_data)
## [1] 100

Scrape for Movie Director Information

# Select the first link in the paragraph that follows each .text-muted element
directors_data_html <- webpage |> html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the director names as text
directors_data <- directors_data_html |> html_text()

#Let's have a look at the directors data
head(directors_data)
## [1] "Damien Leone"       "David Ayer"         "Martin Scorsese"   
## [4] "Mike Flanagan"      "James Wan"          "M. Night Shyamalan"
# Data-Preprocessing: store the director names as a factor
directors_data <- factor(directors_data)
length(directors_data)
## [1] 100

Scrape for Movie Actor Information

# Select the link that directly follows the .ghost separator in each item
actors_data_html <- webpage |> html_nodes(".lister-item-content .ghost+ a")

# Extract the actor names as text
actors_data <- actors_data_html |> html_text()

#Let's have a look at the actors data
head(actors_data)
## [1] "Jenna Kanell"       "Will Smith"         "Andrew Garfield"   
## [4] "John Gallagher Jr." "Vera Farmiga"       "James McAvoy"
# Data-Preprocessing: store the actor names as a factor
actors_data <- factor(actors_data)
length(actors_data)
## [1] 99
# Select the Metascore nodes via the .metascore CSS class
metascore_data_html <- webpage |> html_nodes(".metascore")

# Extract the Metascore strings as text
metascore_data <- metascore_data_html |> html_text()

#Let's have a look at the metascore data 
head(metascore_data)
## [1] "40        " "79        " "67        " "65        " "63        "
## [6] "71        "
# Data-Preprocessing: strip the padding spaces from each Metascore string
metascore_data <- gsub(" ", "", metascore_data, fixed = TRUE)

#Lets check the length of metascore data
length(metascore_data)
## [1] 95

Find the movies with missing Metascore values and fill them in with NAs. I am using this automated method instead of the fallible manual method provided in the tutorial.

# Scrape every ratings bar and render its text with html_text2(); there is one
# entry per bar, so films without a Metascore still occupy a position
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))

# look at the ratings bar
head(ratings_bar_data)
## [1] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [2] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [3] "7.2\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.2/10 X \n79 Metascore"
## [4] "6.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.6/10 X \n67 Metascore"
## [5] "7.3\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.3/10 X \n65 Metascore"
## [6] "7.3\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.3/10 X \n63 Metascore"
# extract Metascore: capture the 1-3 digit number that precedes " Metascore".
# A fixed two-digit pattern would read a score of 100 as "00" and miss
# single-digit scores. Column 2 of the str_match() result is the capture
# group; bars without a Metascore yield NA.
metascore_data <- str_match(ratings_bar_data, "(\\d{1,3}) Metascore")[, 2] |>
  as.numeric() # convert to number

length(metascore_data)
## [1] 100
metascore_data
##   [1] NA 40 79 67 65 63 71 85 94 59 81 81 55 65 73 70 65 88 57 65 78 81 54 67 60
##  [26] 44 51 41 65 74 71 81 66 96 68 58 76 47 66 NA 82 48 82 NA 44 51 75 42 32 25
##  [51] 66 52 51 99 72 58 77 57 81 37 48 44 72 32 NA 45 44 47 66 46 53 81 69 49 77
##  [76] 62 58 35 33 68 78 42 42 36 46 34 61 60 60 21 38 66 26 59 55 62 NA 68 79 74
#Let's look at summary statistics
summary(metascore_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   62.00   60.36   72.50   99.00       5

Find the missing gross earnings (automated). Earnings are part of the votes bar in the HTML, so scrape the votes bar and extract the earnings with a regular expression; this keeps the NAs in context, aligned with the movies they belong to.

# Scrape the votes bar of every film and render its text with html_text2()
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))

head(votes_bar_data) # look at the votes bar data
## [1] "Votes: 47,734"                    "Votes: 710,230 | Gross: $325.10M"
## [3] "Votes: 119,468 | Gross: $7.10M"   "Votes: 149,269"                  
## [5] "Votes: 292,311 | Gross: $102.47M" "Votes: 532,913 | Gross: $138.29M"
# Extract the gross earnings: capture the number between "$" and "M".
# Taking a fixed-width substring would truncate values such as "$102.47M"
# to 102.4, so the full captured number is kept instead.
# Column 2 of the str_match() result is the capture group; bars without a
# "Gross" entry yield NA.
gross_data <- str_match(votes_bar_data, "\\$([0-9.,]+)M")[, 2]
# Drop any thousands separators before converting to numeric
gross_data <- as.numeric(gsub(",", "", gross_data, fixed = TRUE))
length(gross_data)
## [1] 100

Data quality check

length(rank_data)
## [1] 100
length(title_data)
## [1] 100
length(description_data)
## [1] 100
length(runtime_data)
## [1] 100
length(genre_data)
## [1] 100
length(rating_data)
## [1] 100
length(metascore_data)
## [1] 100
length(votes_data)
## [1] 100
length(gross_data)
## [1] 100
length(directors_data)
## [1] 100

Combine all the lists to form a data frame

# Combining all the vectors to form a data frame without actors_data
# (actors_data came back with 99 entries, so it cannot be aligned row-by-row)

# data.frame() recycles a shorter atomic vector a whole number of times rather
# than failing, so confirm every column has the same length before combining.
column_lengths <- lengths(list(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data
))
if (length(unique(column_lengths)) != 1L) {
  stop(
    "Scraped vectors differ in length: ",
    paste0(names(column_lengths), "=", column_lengths, collapse = ", "),
    call. = FALSE
  )
}

movies_df <- data.frame(Rank = rank_data,
                        Title = title_data,
                        Description = description_data,
                        Runtime = runtime_data,
                        Genre = genre_data,
                        Rating = rating_data,
                        Metascore = metascore_data,
                        Votes = votes_data,
                        Gross_Earning_in_Mil = gross_data,
                        Director = directors_data) # Removed Actor = actors_data

# Structure of the data frame

str(movies_df)
## 'data.frame':    100 obs. of  10 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Terrifier" "Suicide Squad" "Silence" "Hush" ...
##  $ Description         : chr  "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is"| __truncated__ "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence "| __truncated__ ...
##  $ Runtime             : num  85 123 161 82 134 117 139 145 128 108 ...
##  $ Genre               : Factor w/ 9 levels "Action","Adventure",..: 9 1 7 9 9 9 4 7 5 3 ...
##  $ Rating              : num  5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
##  $ Metascore           : num  NA 40 79 67 65 63 71 85 94 59 ...
##  $ Votes               : num  47734 710230 119468 149269 292311 ...
##  $ Gross_Earning_in_Mil: num  NA 325.1 7.1 NA 102.4 ...
##  $ Director            : Factor w/ 97 levels "Alessandro Carloni",..: 19 22 61 65 44 59 63 71 18 34 ...

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

library(dplyr)

# Pick the single row with the greatest Runtime
longest_runtime_movie <- movies_df |>
  arrange(desc(Runtime)) |>
  slice(1)  # or head(1)

# Report title, genre and runtime; the question asks for the genre as well.
# paste0() converts the Genre factor to its label.
cat(
  paste0(
    "Longest Runtime Movie Title: ",
    longest_runtime_movie$Title,
    "\n",
    "Longest Runtime Movie Genre: ",
    longest_runtime_movie$Genre,
    "\n",
    "Longest Runtime: ",
    longest_runtime_movie$Runtime
  )
)
## Longest Runtime Movie Title: Silence
## Longest Runtime Movie Genre: Drama
## Longest Runtime: 161
# Top Runtime movies: the ten longest films, longest first
top_runtime_movie <- movies_df |>
  arrange(desc(Runtime)) |>
  slice_head(n = 10)

# Dodged bars of runtime per genre, one bar per title
p1 <- ggplot(top_runtime_movie, aes(x = Genre, y = Runtime, fill = Title)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.65) +
  labs(
    title = "Top 10 Runtime Movies and Genre",
    x = "Genre",
    y = "Runtime(Minutes)",
    fill = "Movie Title"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 35, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

# Render the ggplot as an interactive plotly widget
ggplotly(p1)

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

library(dplyr)

# Sum the votes per genre for films running 130-160 minutes (inclusive) and
# keep the genre with the largest total
highest_votes <- movies_df |>
  filter(between(Runtime, 130, 160)) |>
  group_by(Genre) |>
  summarise(total_votes = sum(Votes)) |>
  arrange(desc(total_votes)) |>
  head(1)

# Print the winning genre and its vote total on separate lines
cat(paste0(
  "Highest Votes Movie Genre: ", highest_votes$Genre, "\n",
  "Highest Votes: ", highest_votes$total_votes
))
## Highest Votes Movie Genre: Action
## Highest Votes: 3135008
# Total votes per genre for films running 130-160 minutes (inclusive).
# Computed once and shared by both charts below, so the two plots cannot
# drift apart.
votes_by_genre <- movies_df %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  group_by(Genre) %>%
  summarise(total_votes = sum(Votes)) %>%
  arrange(desc(total_votes))

# Bubble chart: one point per genre, sized and coloured by total votes
p2 <- ggplot(votes_by_genre, aes(y = Genre, x = total_votes, fill = Genre)) +
  geom_point(aes(size = total_votes, col = Genre), alpha = 0.7) +
  labs(title = "Highest votes by genre in the runtime of 130-160 mins",
       x = "Votes",
       y = "Genre") +
  theme_minimal()
p2

# Create bar chart (replaces the bubble chart stored in p2)
p2 <- ggplot(votes_by_genre, aes(x = Genre, y = total_votes, fill = Genre)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  labs(title = "Highest votes by genre in the runtime of 130-160 mins",
       x = "Genre",
       y = "Votes") +
  theme_minimal()

# Convert to interactive plot using plotly
ggplotly(p2)

Question 3: Based on the above data, across all genres which genre has the highest average gross earnings in runtime 100 to 120.

library(dplyr)

# Mean gross per genre (NA earnings ignored) for runtimes of 100-120 minutes
# inclusive, highest average first
movies_df |>
  filter(between(Runtime, 100, 120)) |>
  group_by(Genre) |>
  summarise(
    average_gross = mean(Gross_Earning_in_Mil, na.rm = TRUE)
  ) |>
  arrange(desc(average_gross))
## # A tibble: 8 × 2
##   Genre     average_gross
##   <fct>             <dbl>
## 1 Animation         216. 
## 2 Adventure         125. 
## 3 Action             89.2
## 4 Crime              51.2
## 5 Drama              48.4
## 6 Horror             46.8
## 7 Comedy             33.9
## 8 Biography          28.7

Animation has the highest average gross earnings, at about $216.33 million, among movies with a runtime of 100 to 120 minutes.

# Bar chart of mean gross earnings per genre for runtimes of 100-120 minutes.
# Gross_Earning_in_Mil holds millions of dollars, so the axis label states
# the unit explicitly.
p3 <- movies_df |>
  filter(Runtime >= 100 & Runtime <= 120) |>
  group_by(Genre) |>
  summarise(average_gross = mean(Gross_Earning_in_Mil, na.rm = TRUE)) |>
  arrange(desc(average_gross)) |>
  ggplot(aes(x = Genre, y = average_gross, fill = Genre)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Average Gross by Genre",
    x = "Genre",
    y = "Average Gross ($ millions)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the ggplot as an interactive plotly widget
ggplotly(p3)