Webscraping IMDB tutorial

Author

Aaron T

Install necessary packages for this project

#install.packages('rvest')
#Loading the rvest package
library(rvest)
Warning: package 'rvest' was built under R version 4.3.3
library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.3.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Scrape the IMDB website to create a dataframe of information from 2019 top 100 movies

Use the following URL for IMDB movies released in 2019

https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100

# Address of the IMDB search-results page that will be scraped
url <- "https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100"

# Fetch the page and parse it into an HTML document
webpage <- url |> read_html()
# save_url(webpage, filename="webpage.html")

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

Use the `length` command to confirm that every scraped vector contains the same number of elements (25 for this page), with NAs standing in for any missing data.

# Collect every node with the title-text class (the page heading as well as
# the numbered movie headings)
rank_title_html <- webpage |>
  html_elements(css = ".ipc-title__text")
head(rank_title_html)
{xml_nodeset (6)}
[1] <h1 class="ipc-title__text">Advanced search</h1>
[2] <h3 class="ipc-title__text">1. Midsommar</h3>
[3] <h3 class="ipc-title__text">2. Once Upon a Time... in Hollywood</h3>
[4] <h3 class="ipc-title__text">3. The Gentlemen</h3>
[5] <h3 class="ipc-title__text">4. Avengers: Endgame</h3>
[6] <h3 class="ipc-title__text">5. Parasite</h3>
# Convert the title nodes to plain text
rank_title <- html_text(rank_title_html)

# Keep only entries shaped like "<rank>. <title>". Matching on the pattern
# drops the page heading and any other non-movie text without depending on
# hard-coded positions, so it keeps working if the page returns a different
# number of results.
rank_title_data <- str_subset(rank_title, "^\\d+\\.\\s")

# Inspect the last few entries
tail(rank_title_data)
[1] "20. Uncut Gems"            "21. Glass"                
[3] "22. The Lion King"         "23. Terminator: Dark Fate"
[5] "24. Dark Waters"           "25. The King"             
# notice that the format is "rank. title"
length(rank_title_data)
[1] 25
#should be 25

Scrape for Rank Information from the rank_title information

# Pull the leading number out of each "rank. title" string
rank_data <- rank_title_data |>
  parse_number()
summary(rank_data)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       7      13      13      19      25 

Scrape for Title Information

# Strip the leading "<rank>. " prefix so only the title remains. A regex is
# used rather than a fixed character offset because the prefix is one
# character longer for ranks 10 and above; a fixed offset of 4 leaves a
# leading space on those titles.
title_data <- str_remove(rank_title_data, "^\\d+\\.\\s*")

head(title_data)      #check first 6 titles
[1] "Midsommar"                        "Once Upon a Time... in Hollywood"
[3] "The Gentlemen"                    "Avengers: Endgame"               
[5] "Parasite"                         "1917"                            
length(title_data)     # check number of titles - should be 25, matching rank_title_data
[1] 25

Scrape for Movie Description Information

# Locate the plot-summary nodes by their CSS class
description_data_html <- webpage |>
  html_elements(css = ".ipc-html-content-inner-div")

# Extract the text content of each summary node
description_data <- description_data_html |>
  html_text()

# Preview the first few descriptions
head(description_data)
[1] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."   
[2] "As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way."
[3] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                                
[4] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."        
[5] "Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."                                                                                                 
[6] "April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."            
#What is the length of this vector for description data 
length(description_data)
[1] 25
#It should be 25

Scrape for details information

span.sc-b189961a-8.kLaxqf.dli-title-metadata-item

# Select the title-metadata items; for each movie these come through as
# separate entries for year, runtime, and rating.
# NOTE(review): the `sc-b189961a-8` and `kLaxqf` class names look
# auto-generated and may change between site builds - confirm the selector
# still matches before relying on it.
details_data_html <- webpage |>
  html_elements(css = "span.sc-b189961a-8.kLaxqf.dli-title-metadata-item")

# Convert the metadata nodes to text
details_data <- details_data_html |>
  html_text()
head(details_data)
[1] "2019"   "2h 28m" "R"      "2019"   "2h 41m" "R"     

Details include year, runtime, and rating

Filter just for the runtime

# Keep only the metadata entries that contain an hours component ("<digits>h"),
# i.e. the runtimes; years and ratings are discarded
runtime_text <- grep("\\d+h", details_data, value = TRUE)
head(runtime_text)
[1] "2h 28m" "2h 41m" "1h 53m" "3h 1m"  "2h 12m" "1h 59m"

Convert runtime_text from hours and minutes to minutes

# Convert runtime_text such as "2h 28m", "2h", or "45m" to total minutes.
# Hours and minutes are extracted independently so that a missing component
# counts as zero rather than turning the whole value into NA; text containing
# neither component stays NA. The result is always a numeric vector, even for
# empty input.
runtime_hours <- as.numeric(str_extract(runtime_text, "\\d+(?=h)"))
runtime_mins <- as.numeric(str_extract(runtime_text, "\\d+(?=m)"))
converted_runtimes <- coalesce(runtime_hours, 0) * 60 + coalesce(runtime_mins, 0)
converted_runtimes[is.na(runtime_hours) & is.na(runtime_mins)] <- NA_real_

# Display the converted movie runtimes
head(converted_runtimes)
[1] 148 161 113 181 132 119
length(converted_runtimes)
[1] 25
summary(converted_runtimes)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   95.0   116.0   126.0   127.2   135.0   181.0 

Check to make sure movies match with runtimes with a temporary data frame

# Temporary data frame pairing each title with its converted runtime, used to
# eyeball that the two vectors line up
df_1 <- data.frame(
  Title = title_data,
  Runtime = converted_runtimes
)
head(df_1)
                             Title Runtime
1                        Midsommar     148
2 Once Upon a Time... in Hollywood     161
3                    The Gentlemen     113
4                Avengers: Endgame     181
5                         Parasite     132
6                             1917     119

Scrape for Voting Information

# Use CSS selectors to scrape the number of votes.
# html_elements() is used here (instead of the superseded html_nodes()) to
# match the other scraping steps in this file.
votes_labels <- html_elements(webpage, css = "span.ipc-rating-star--voteCount")
vote_text <- html_text(votes_labels)
head(vote_text)
[1] " (411K)" " (859K)" " (408K)" " (1.3M)" " (974K)" " (685K)"

Clean votes character strings

Remove the parentheses and surrounding whitespace from vote_text

# Strip the parentheses and all whitespace in one pass. The leading character
# in the scraped text is not a plain ASCII space (replacing the literal " "
# leaves it in place), so the pattern matches any Unicode whitespace or
# separator character instead.
vote_text <- str_replace_all(vote_text, "[()\\s\\p{Z}]", "")
head(vote_text)
[1] " 411K" " 859K" " 408K" " 1.3M" " 974K" " 685K"

Combine all the lists to form a data frame

# Assemble the scraped vectors into a single data frame, one row per movie
df_movies <- data.frame(
  rank = rank_data,
  title = title_data,
  description = description_data,
  runtime = converted_runtimes,
  votes = vote_text
)
head(df_movies)
  rank                            title
1    1                        Midsommar
2    2 Once Upon a Time... in Hollywood
3    3                    The Gentlemen
4    4                Avengers: Endgame
5    5                         Parasite
6    6                             1917
                                                                                                                                                                                                                                   description
1    A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
3                                                 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
4         After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
5                                                                                                  Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
6             April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
  runtime votes
1     148  411K
2     161  859K
3     113  408K
4     181  1.3M
5     132  974K
6     119  685K

Convert M and K number of votes

M stands for millions; K stands for thousands.

# Express the abbreviated vote counts in thousands of votes.
# The numeric part is parsed first and then scaled by its suffix, so "1.3M"
# becomes 1300 (thousand). Appending "000" and then deleting the decimal point
# would instead turn "1.3M" into 13000. A value with no suffix is treated as a
# raw count and divided by 1000.
df2 <- df_movies |>
  mutate(
    votes_thous = parse_number(votes) * case_when(
      str_detect(votes, "M") ~ 1000,
      str_detect(votes, "K") ~ 1,
      TRUE ~ 1 / 1000
    ),
    # stored as character, as before, so it can still be passed to parse_number()
    votes_thous = as.character(votes_thous)
  )
head(df2)
  rank                            title
1    1                        Midsommar
2    2 Once Upon a Time... in Hollywood
3    3                    The Gentlemen
4    4                Avengers: Endgame
5    5                         Parasite
6    6                             1917
                                                                                                                                                                                                                                   description
1    A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
3                                                 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
4         After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
5                                                                                                  Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
6             April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
  runtime votes votes_thous
1     148  411K         411
2     161  859K         859
3     113  408K         408
4     181  1.3M       13000
5     132  974K         974
6     119  685K         685
# Add a numeric version of the votes_thous column
df2 <- df2 |>
  mutate(votes_in_thous = parse_number(votes_thous))
head(df2)
  rank                            title
1    1                        Midsommar
2    2 Once Upon a Time... in Hollywood
3    3                    The Gentlemen
4    4                Avengers: Endgame
5    5                         Parasite
6    6                             1917
                                                                                                                                                                                                                                   description
1    A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
3                                                 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
4         After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
5                                                                                                  Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
6             April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
  runtime votes votes_thous votes_in_thous
1     148  411K         411            411
2     161  859K         859            859
3     113  408K         408            408
4     181  1.3M       13000          13000
5     132  974K         974            974
6     119  685K         685            685

Problem 1: Based on the scraped 2019 IMDB movie data frame, create a histogram that shows runtime on the x-axis. Be sure to provide a title, axis label, and caption for the data source.

Alternatively you may create a scatterplot of runtime versus number of votes.

## ggplot
library(ggplot2)

# Histogram of movie runtimes using 10-minute bins
df_movies |>
  ggplot(aes(x = runtime)) +
  geom_histogram(
    binwidth = 10,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Distribution of Movie Runtimes (2019)",
    x = "Runtime (minutes)",
    y = "Frequency",
    caption = "Data Source: IMDB"
  ) +
  theme_minimal()

# Scatter plot of runtime against vote count (in thousands)
df2 |>
  ggplot(aes(x = runtime, y = votes_in_thous)) +
  geom_point(
    color = "red",
    alpha = 0.6
  ) +
  labs(
    title = "Runtime vs. Number of Votes for Movies (2019)",
    x = "Runtime (minutes)",
    y = "Number of Votes (in thousands)",
    caption = "Data Source: IMDB"
  ) +
  theme_minimal()

Problem 2: Use the filter function to answer the following question.

Which of the 5 top-ranked movies had a runtime of 100-150 minutes? Be sure to state the rank and runtime for each movie.

library(dplyr)

# Restrict to the five top-ranked movies whose runtime falls within
# 100 to 150 minutes (both ends inclusive)
filtered_movies <- df_movies |>
  filter(rank <= 5, between(runtime, 100, 150))

# Report only the rank, title, and runtime columns
filtered_movies_selected <- filtered_movies |>
  select(rank, title, runtime)

print(filtered_movies_selected)
  rank         title runtime
1    1     Midsommar     148
2    3 The Gentlemen     113
3    5      Parasite     132

Problem 3: In the runtime of 116-135 mins, which movies are from the lowest ranked 5 out of 25?

Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and runtime for each movie.

# Take the five lowest-ranked movies (ranks 21 through 25) and keep those with
# a runtime within 116 to 135 minutes (both ends inclusive)
lowest_ranked_movies <- df_movies |>
  filter(rank > 20, rank <= 25, between(runtime, 116, 135))

# Report only the rank, title, and runtime columns
lowest_ranked_movies_selected <- lowest_ranked_movies |>
  select(rank, title, runtime)

print(lowest_ranked_movies_selected)
  rank                  title runtime
1   21                  Glass     129
2   22          The Lion King     118
3   23  Terminator: Dark Fate     128
4   24            Dark Waters     126