# Search-results page to scrape: feature films released during 2019.
url <- "https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100"

# Download the page and parse its HTML into a document object.
webpage <- read_html(x = url)
# save_url(webpage, filename="webpage.html")
Load various elements and clean data using gsub.
Scrape for Movie Rank Information
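Use the `length()` function to confirm that each scraped vector contains 25 elements, one per movie, with `NA` standing in for any missing value. (The URL requests `count=100`, but only 25 results are present in the scraped page.)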
# Select every heading that carries the title class. Besides the movie
# headings ("<rank>. <title>") this also matches the page heading.
rank_title_html <- html_elements(x = webpage, css = ".ipc-title__text")
head(rank_title_html)
{xml_nodeset (6)}
[1] <h1 class="ipc-title__text">Advanced search</h1>
[2] <h3 class="ipc-title__text">1. Midsommar</h3>
[3] <h3 class="ipc-title__text">2. Once Upon a Time... in Hollywood</h3>
[4] <h3 class="ipc-title__text">3. The Gentlemen</h3>
[5] <h3 class="ipc-title__text">4. Avengers: Endgame</h3>
[6] <h3 class="ipc-title__text">5. Parasite</h3>
# Convert the scraped heading nodes to plain text.
rank_title <- html_text(rank_title_html)

# Keep only the headings formatted as "<rank>. <title>". Selecting by pattern
# rather than by hard-coded positions (previously elements 1 and 27 were
# dropped) keeps this correct if the page returns a different number of
# movies or of non-movie headings.
rank_title_data <- rank_title[grepl("^\\d+\\.\\s", rank_title)]

# Inspect the last few entries.
tail(rank_title_data)
[1] "20. Uncut Gems" "21. Glass"
[3] "22. The Lion King" "23. Terminator: Dark Fate"
[5] "24. Dark Waters" "25. The King"
# Every entry is formatted as "rank. title"; count how many entries were kept.
length(rank_title_data)
[1] 25
#should be 25
Scrape for Rank Information from the rank_title information
# Pull the leading rank number out of each "rank. title" string, discarding
# the title text.
rank_data <- parse_number(rank_title_data)

# Sanity-check the range of the extracted ranks.
summary(rank_data)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 7 13 13 19 25
Scrape for Title Information
# Strip the leading "<rank>. " prefix from each heading to leave the title.
# A regular expression is used instead of a fixed character offset because the
# prefix is one character longer for ranks of 10 and above; a fixed start
# position of 4 left a leading space on those titles.
title_data <- sub("^\\d+\\.\\s*", "", rank_title_data)
head(title_data) # check first 6 titles
[1] "Midsommar" "Once Upon a Time... in Hollywood"
[3] "The Gentlemen" "Avengers: Endgame"
[5] "Parasite" "1917"
length(title_data) # check number of titles - should be 25, one per scraped movie
[1] 25
Scrape for Movie Description Information
# Select the elements that hold each movie's plot summary.
description_data_html <- html_elements(
  x = webpage,
  css = ".ipc-html-content-inner-div"
)

# Reduce the selected nodes to their text content.
description_data <- html_text(x = description_data_html)

# Preview the first few descriptions.
head(description_data)
[1] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."
[2] "As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way."
[3] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
[4] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."
[5] "Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
[6] "April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."
# Count the scraped descriptions; there should be one per movie.
length(description_data)
[1] 25
#It should be 25
Scrape for details information
span.sc-b189961a-8.kLaxqf.dli-title-metadata-item
# Select the per-movie metadata items (year, runtime and rating).
# Only the descriptive class `dli-title-metadata-item` is matched: the other
# classes in the original selector (`sc-b189961a-8`, `kLaxqf`) look like
# build-generated names, so depending on them would break the scrape whenever
# the site is redeployed.
# NOTE(review): confirm no other <span> elements on the page carry this class.
details_data_html <- html_elements(webpage, css = "span.dli-title-metadata-item")

# Convert the metadata nodes to text.
details_data <- html_text(details_data_html)
head(details_data)
[1] "2019" "2h 28m" "R" "2019" "2h 41m" "R"
Details include year, runtime, and rating
Filter just for the runtime
# Of the metadata strings, keep only those that contain an hour component
# (digits followed by "h"), i.e. the runtimes such as "2h 28m".
runtime_text <- grep("\\d+h", details_data, value = TRUE)
head(runtime_text)
Convert runtime_text from hours and minutes to minutes
# Convert runtime strings such as "2h 28m" into total minutes.
# Hours and minutes are parsed independently so that a runtime lacking one of
# the two parts (e.g. "2h" or "45m") contributes 0 for the missing part rather
# than producing NA. The vectorised form also always returns a numeric vector
# of the same length as runtime_text, which sapply() does not guarantee.
has_hours <- grepl("\\d+h", runtime_text)
has_minutes <- grepl("\\d+m", runtime_text)

runtime_hours <- numeric(length(runtime_text))
runtime_hours[has_hours] <- as.numeric(
  sub("^.*?(\\d+)h.*$", "\\1", runtime_text[has_hours], perl = TRUE)
)

runtime_minutes <- numeric(length(runtime_text))
runtime_minutes[has_minutes] <- as.numeric(
  sub("^.*?(\\d+)m.*$", "\\1", runtime_text[has_minutes], perl = TRUE)
)

converted_runtimes <- runtime_hours * 60 + runtime_minutes

# Display the converted movie runtimes
head(converted_runtimes)
[1] 148 161 113 181 132 119
# Number of runtimes parsed; this has to equal the number of titles for the
# data frames built from these vectors to line up row by row.
length(converted_runtimes)
[1] 25
# Distribution summary (min, quartiles, median, mean, max) of the runtimes,
# which are expressed in minutes after the conversion above.
summary(converted_runtimes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
95.0 116.0 126.0 127.2 135.0 181.0
Check to make sure movies match with runtimes with a temporary data frame
# Temporary data frame pairing each title with its runtime in minutes, used
# only to eyeball that the two vectors are aligned.
df_1 <- data.frame(
  Title = title_data,
  Runtime = converted_runtimes
)
head(df_1)
Title Runtime
1 Midsommar 148
2 Once Upon a Time... in Hollywood 161
3 The Gentlemen 113
4 Avengers: Endgame 181
5 Parasite 132
6 1917 119
Scrape for Voting Information
# Select the vote-count labels shown next to each movie's star rating.
# html_elements() is used for consistency with the other scraping steps in
# this file; html_nodes() is its superseded predecessor in rvest.
votes_labels <- html_elements(webpage, css = "span.ipc-rating-star--voteCount")

# Convert the labels to text (abbreviated counts such as "411K" or "1.3M").
vote_text <- html_text(votes_labels)
head(vote_text)
# Combine the scraped vectors into a single data frame, one row per movie.
# All five vectors must have the same length for the rows to line up.
df_movies <- data.frame(
  rank = rank_data,
  title = title_data,
  description = description_data,
  runtime = converted_runtimes,
  votes = vote_text
)
head(df_movies)
rank title
1 1 Midsommar
2 2 Once Upon a Time... in Hollywood
3 3 The Gentlemen
4 4 Avengers: Endgame
5 5 Parasite
6 6 1917
description
1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
3 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
4 After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
5 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
6 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
runtime votes
1 148 411K
2 161 859K
3 113 408K
4 181 1.3M
5 132 974K
6 119 685K
rank title
1 1 Midsommar
2 2 Once Upon a Time... in Hollywood
3 3 The Gentlemen
4 4 Avengers: Endgame
5 5 Parasite
6 6 1917
description
1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
3 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
4 After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
5 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
6 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
runtime votes votes_thous
1 148 411K 411
2 161 859K 859
3 113 408K 408
4 181 1.3M 13000
5 132 974K 974
6 119 685K 685
rank title
1 1 Midsommar
2 2 Once Upon a Time... in Hollywood
3 3 The Gentlemen
4 4 Avengers: Endgame
5 5 Parasite
6 6 1917
description
1 A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
3 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
4 After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
5 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
6 April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
runtime votes votes_thous votes_in_thous
1 148 411K 411 411
2 161 859K 859 859
3 113 408K 408 408
4 181 1.3M 13000 13000
5 132 974K 974 974
6 119 685K 685 685
Problem 1: Based on the scraped 2019 IMDB movie data frame, create a histogram that shows runtime on the x-axis. Be sure to provide a title, axis label, and caption for the data source.
Alternatively you may create a scatterplot of runtime versus number of votes.
library(ggplot2)

# Histogram of movie runtimes in 10-minute bins, with a title, axis labels
# and a caption naming the data source.
ggplot(data = df_movies, mapping = aes(x = runtime)) +
  geom_histogram(
    binwidth = 10,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Distribution of Movie Runtimes (2019)",
    x = "Runtime (minutes)",
    y = "Frequency",
    caption = "Data Source: IMDB"
  ) +
  theme_minimal()
# Scatter plot of runtime (x) against number of votes in thousands (y).
# NOTE(review): `df2` and its `votes_in_thous` column are not created in the
# code shown in this document - confirm where they are defined.
ggplot(data = df2, mapping = aes(x = runtime, y = votes_in_thous)) +
  geom_point(color = "red", alpha = 0.6) +
  labs(
    title = "Runtime vs. Number of Votes for Movies (2019)",
    x = "Runtime (minutes)",
    y = "Number of Votes (in thousands)",
    caption = "Data Source: IMDB"
  ) +
  theme_minimal()
Problem 2: Use the filter function to answer the following question:
Which of the 5 top-ranked movies had a runtime of 100-150 minutes? Be sure to state the rank and runtime for each movie.
library(dplyr)

# Keep the movies ranked 1-5 whose runtime lies in the inclusive range
# 100-150 minutes.
filtered_movies <- df_movies %>%
  filter(
    between(runtime, 100, 150),
    rank <= 5
  )

# Report only the columns the question asks for.
filtered_movies_selected <- select(filtered_movies, rank, title, runtime)
print(filtered_movies_selected)
rank title runtime
1 1 Midsommar 148
2 3 The Gentlemen 113
3 5 Parasite 132
Problem 3: Which of the 5 lowest-ranked movies (ranks 21-25 out of 25) have a runtime of 116-135 minutes?
Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and runtime for each movie.