#install.packages('rvest')
#Loading the rvest package
library(rvest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100
# Target page: IMDb advanced search for feature films released in 2019
url <- "https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100"
# Download the page and parse its HTML
webpage <- url |> read_html()
# save_url(webpage, filename="webpage.html")
Use the length() command to ensure that each vector contains one element per movie. Although the URL requests 100 results, this page returns 25, so each vector (with NAs for any missing data) should have 25 elements.
# Select every heading node that carries the "rank. title" text
rank_title_html <- webpage |>
  html_elements(css = ".ipc-title__text")
# Preview the first few matched nodes
head(rank_title_html)
## {xml_nodeset (6)}
## [1] <h1 class="ipc-title__text">Advanced search</h1>
## [2] <h3 class="ipc-title__text">1. Midsommar</h3>
## [3] <h3 class="ipc-title__text">2. Once Upon a Time... in Hollywood</h3>
## [4] <h3 class="ipc-title__text">3. The Gentlemen</h3>
## [5] <h3 class="ipc-title__text">4. Avengers: Endgame</h3>
## [6] <h3 class="ipc-title__text">5. Parasite</h3>
# Extract the text of each heading node
rank_title <- html_text(rank_title_html)
# Drop the first and last headings - they are not movie titles.
# The last position is computed rather than hard-coded (it was 27), so this
# keeps working when the page returns a different number of movies.
rank_title_data <- rank_title[-c(1L, length(rank_title))]
# Inspect the last few entries
tail(rank_title_data)
## [1] "20. Uncut Gems" "21. Glass"
## [3] "22. The Lion King" "23. Terminator: Dark Fate"
## [5] "24. Dark Waters" "25. The King"
# Each entry has the form "rank. title"
length(rank_title_data)
## [1] 25
# expected: 25
# Keep only the leading rank number of each entry
rank_data <- rank_title_data |> parse_number()
summary(rank_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 7 13 13 19 25
# Strip the leading "rank. " prefix so that only the title remains.
# A fixed start position (4) is only right for one-digit ranks: an entry such
# as "10. Title" would become " Title". Removing the prefix by pattern works
# for a rank of any width.
title_data <- str_remove(rank_title_data, "^\\d+\\.\\s*")
head(title_data) #check first 6 titles
## [1] "Midsommar" "Once Upon a Time... in Hollywood"
## [3] "The Gentlemen" "Avengers: Endgame"
## [5] "Parasite" "1917"
length(title_data) # number of titles - should equal length(rank_title_data)
## [1] 25
# Select the plot-summary nodes
description_data_html <- webpage |>
  html_elements(css = ".ipc-html-content-inner-div")
# Pull the text out of each node
description_data <- description_data_html |> html_text()
# Preview the first few descriptions
head(description_data)
## [1] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
## [2] "As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way."
## [3] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [4] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
## [5] "Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
## [6] "April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."
# Number of scraped descriptions. It must equal the number of titles because
# these vectors are later combined column-wise into one data frame.
length(description_data)
## [1] 25
# It should be 25
span.sc-b189961a-8.kLaxqf.dli-title-metadata-item
# Select the per-movie metadata items (e.g. year, runtime, rating)
details_data_html <- webpage |>
  html_elements(css = "span.sc-b189961a-8.kLaxqf.dli-title-metadata-item")
# Extract the text of each metadata item
details_data <- details_data_html |> html_text()
head(details_data)
## [1] "2019" "2h 28m" "R" "2019" "2h 41m" "R"
# Keep only the runtime entries of details_data. Accepted forms are "Xh YYm",
# "Xh" and "YYm"; matching only on "\\d+h" would silently drop a movie shorter
# than one hour and shift every later runtime onto the wrong title.
# NOTE(review): a movie with no runtime listed at all still yields no entry
# here, so compare the length printed below with the number of titles.
runtime_text <- details_data[grepl("^(\\d+h(\\s*\\d+m)?|\\d+m)$", details_data)]
head(runtime_text)
## [1] "2h 28m" "2h 41m" "1h 53m" "3h 1m" "2h 12m" "1h 59m"
# Convert runtime_text to minutes. The hour and minute parts are extracted
# separately and a missing part counts as 0, so "2h" gives 120 and "45m"
# gives 45 instead of NA.
runtime_hours <- as.numeric(str_extract(runtime_text, "\\d+(?=h)"))
runtime_minutes <- as.numeric(str_extract(runtime_text, "\\d+(?=m)"))
runtime_hours[is.na(runtime_hours)] <- 0
runtime_minutes[is.na(runtime_minutes)] <- 0
converted_runtimes <- runtime_hours * 60 + runtime_minutes
# Display the converted movie runtimes
head(converted_runtimes)
## [1] 148 161 113 181 132 119
length(converted_runtimes)
## [1] 25
summary(converted_runtimes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 95.0 116.0 126.0 127.2 135.0 181.0
# Pair each title with its runtime in minutes and preview the result
df_1 <- data.frame(
  Title = title_data,
  Runtime = converted_runtimes
)
head(df_1)
## Title Runtime
## 1 Midsommar 148
## 2 Once Upon a Time... in Hollywood 161
## 3 The Gentlemen 113
## 4 Avengers: Endgame 181
## 5 Parasite 132
## 6 1917 119
# Select the vote-count labels. html_elements() is used instead of the older
# html_nodes() name so this call matches the other selector calls in the script.
votes_labels <- html_elements(webpage, css = "span.ipc-rating-star--voteCount")
# Extract the label text, e.g. " (411K)"
vote_text <- html_text(votes_labels)
head(vote_text)
## [1] " (411K)" " (859K)" " (409K)" " (1.3M)" " (974K)" " (685K)"
Remove the parentheses around vote_text
# Strip the parentheses and all whitespace in one pass. Replacing a plain " "
# does not remove the leading blank, which suggests it is a non-breaking (or
# other Unicode) space; the regex class \s covers those as well.
vote_text <- str_remove_all(vote_text, "[()\\s]")
head(vote_text)
## [1] "411K" "859K" "409K" "1.3M" "974K" "685K"
# Combine every scraped vector into one data frame, one row per movie
df_movies <- data.frame(
  rank = rank_data,
  title = title_data,
  description = description_data,
  runtime = converted_runtimes,
  votes = vote_text
)
head(df_movies)
## rank title
## 1 1 Midsommar
## 2 2 Once Upon a Time... in Hollywood
## 3 3 The Gentlemen
## 4 4 Avengers: Endgame
## 5 5 Parasite
## 6 6 1917
## description
## 1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
## 3 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 4 After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
## 5 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 6 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## runtime votes
## 1 148 411K
## 2 161 859K
## 3 113 409K
## 4 181 1.3M
## 5 132 974K
## 6 119 685K
M stands for millions; K stands for thousands.
# Express each vote count in thousands ("411K" -> "411", "1.3M" -> "1300").
# Appending "000" and deleting the decimal point turned "1.3M" into "13000"
# (13 million), so the numeric part is instead scaled by its unit suffix:
# M = 1000 thousand, K = 1 thousand, no suffix = a plain count (0.001 thousand).
# votes_thous stays a character column because it is parsed with
# parse_number() afterwards.
# NOTE: tables rendered before this fix still show the old values (e.g. 13000).
df2 <- df_movies |>
  mutate(
    votes_thous = as.character(
      parse_number(votes) *
        ifelse(grepl("M", votes), 1000, ifelse(grepl("K", votes), 1, 0.001))
    )
  )
head(df2)
## rank title
## 1 1 Midsommar
## 2 2 Once Upon a Time... in Hollywood
## 3 3 The Gentlemen
## 4 4 Avengers: Endgame
## 5 5 Parasite
## 6 6 1917
## description
## 1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
## 3 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 4 After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
## 5 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 6 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## runtime votes votes_thous
## 1 148 411K 411
## 2 161 859K 859
## 3 113 409K 409
## 4 181 1.3M 13000
## 5 132 974K 974
## 6 119 685K 685
# Parse the character column votes_thous into a numeric column
df2 <- df2 |>
  mutate(votes_in_thous = parse_number(votes_thous))
head(df2)
## rank title
## 1 1 Midsommar
## 2 2 Once Upon a Time... in Hollywood
## 3 3 The Gentlemen
## 4 4 Avengers: Endgame
## 5 5 Parasite
## 6 6 1917
## description
## 1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
## 3 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 4 After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
## 5 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 6 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## runtime votes votes_thous votes_in_thous
## 1 148 411K 411 411
## 2 161 859K 859 859
## 3 113 409K 409 409
## 4 181 1.3M 13000 13000
## 5 132 974K 974 974
## 6 119 685K 685 685
Alternatively you may create a scatterplot of runtime versus number of votes.
library(ggplot2)
# Histogram of movie runtimes in 5-minute bins
ggplot(data = df2, mapping = aes(x = runtime)) +
  geom_histogram(binwidth = 5, color = "black", fill = "blue") +
  labs(
    title = "Distribution of Movie Runtimes",
    x = "Runtime (minutes)",
    y = "Frequency"
  ) +
  theme_minimal()
Which of the top 5 rated movies had a runtime of 100-150 minutes? Be sure to state the rank and runtime for each movie.
# Keep the movies whose runtime lies between 100 and 150 minutes (inclusive)
filtered_movies <- df2 |>
  filter(runtime >= 100, runtime <= 150)
head(filtered_movies)
## rank title
## 1 1 Midsommar
## 2 3 The Gentlemen
## 3 5 Parasite
## 4 6 1917
## 5 7 Joker
## 6 8 Anna
## description
## 1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 3 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 4 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## 5 Arthur Fleck, a party clown and a failed stand-up comedian, leads an impoverished life with his ailing mother. However, when society shuns him and brands him as a freak, he decides to embrace the life of crime and chaos in Gotham City.
## 6 Beneath Anna Poliatova's striking beauty lies a secret that will unleash her indelible strength and skill to become one of the world's most feared government assassins.
## runtime votes votes_thous votes_in_thous
## 1 148 411K 411 411
## 2 113 409K 409 409
## 3 132 974K 974 974
## 4 119 685K 685 685
## 5 122 1.5M 15000 15000
## 6 118 97K 97 97
Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and runtime for each movie.
# Movies running 116-135 minutes (inclusive), shortest first, first five rows
filtered_movies2 <- df2 |>
  filter(runtime >= 116, runtime <= 135) |>
  arrange(runtime) |>
  slice(seq_len(5))
head(filtered_movies2)
## rank title
## 1 16 Us
## 2 8 Anna
## 3 22 The Lion King
## 4 6 1917
## 5 7 Joker
## description
## 1 Adelaide Wilson and her family are attacked by mysterious figures dressed in red. Upon closer inspection, the Wilsons realize that the intruders are exact lookalikes of them.
## 2 Beneath Anna Poliatova's striking beauty lies a secret that will unleash her indelible strength and skill to become one of the world's most feared government assassins.
## 3 After the murder of his father, a young lion prince flees his kingdom only to learn the true meaning of responsibility and bravery.
## 4 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## 5 Arthur Fleck, a party clown and a failed stand-up comedian, leads an impoverished life with his ailing mother. However, when society shuns him and brands him as a freak, he decides to embrace the life of crime and chaos in Gotham City.
## runtime votes votes_thous votes_in_thous
## 1 116 345K 345 345
## 2 118 97K 97 97
## 3 118 269K 269 269
## 4 119 685K 685 685
## 5 122 1.5M 15000 15000