Install necessary packages for this project

#install.packages('rvest')
#Loading the rvest package
library(rvest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Scrape the IMDB website to create a dataframe of information from 2019 top 100 movies

Use the following URL from IMBD movies of 2019

https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100

#Specifying the url for desired website to be scraped
url <- 'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count=100'

#Reading the HTML code from the website
webpage <- read_html(url)
# save_url(webpage, filename="webpage.html")

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

Use the command, length, to ensure that each list contains 100 elements or NAs for missing data to sum to 25 elements.

#Use CSS selectors to scrape the rankings section
rank_title_html <- html_elements(webpage, css='.ipc-title__text')
head(rank_title_html)
## {xml_nodeset (6)}
## [1] <h1 class="ipc-title__text">Advanced search</h1>
## [2] <h3 class="ipc-title__text">1. Midsommar</h3>
## [3] <h3 class="ipc-title__text">2. Once Upon a Time... in Hollywood</h3>
## [4] <h3 class="ipc-title__text">3. The Gentlemen</h3>
## [5] <h3 class="ipc-title__text">4. Avengers: Endgame</h3>
## [6] <h3 class="ipc-title__text">5. Parasite</h3>
#Convert the ranking data to text
rank_title <- html_text(rank_title_html)

#Remove the first and last rows - they are not movie titles
rank_title_data <- rank_title[-c(1,27)]

#Let's have a look at the rankings
tail(rank_title_data)
## [1] "20. Uncut Gems"            "21. Glass"                
## [3] "22. The Lion King"         "23. Terminator: Dark Fate"
## [5] "24. Dark Waters"           "25. The King"
# notice that the format is "rank. title"
length(rank_title_data)
## [1] 25
#should be 25

Scrape for Rank Information from the rank_title information

# remove the title and extract just the number
rank_data <- parse_number(rank_title_data)
summary(rank_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       7      13      13      19      25

Scrape for Title Information

# Use the rank_title_data and extract just the characters from the title
title_data <- str_sub(rank_title_data, start = 4L, end = -1L)

head(title_data)      #check first 6 titles
## [1] "Midsommar"                        "Once Upon a Time... in Hollywood"
## [3] "The Gentlemen"                    "Avengers: Endgame"               
## [5] "Parasite"                         "1917"
length(title_data)     # check number of titles - should be 100
## [1] 25

Scrape for Movie Description Information

#Use CSS selectors to scrape the description section
description_data_html <- html_elements(webpage, css='.ipc-html-content-inner-div')

#Convert the description data to text
description_data <- html_text(description_data_html)

#Let's have a look at the description data
head(description_data)
## [1] "A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult."   
## [2] "As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way."
## [3] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                                
## [4] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."        
## [5] "Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."                                                                                                 
## [6] "April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."
#What is the length of this vector for description data 
length(description_data)
## [1] 25
#It should be 25

Scrape for details information

span.sc-b189961a-8.kLaxqf.dli-title-metadata-item

#Use CSS selectors to scrape the Movie runtime 
details_data_html <- html_elements(webpage, css = 'span.sc-b189961a-8.kLaxqf.dli-title-metadata-item')

#Convert the description data to text
details_data <- html_text(details_data_html)
head(details_data)
## [1] "2019"   "2h 28m" "R"      "2019"   "2h 41m" "R"

Details include year, runtime, and rating

Filter just for the runtime

# Filter out the movie runtimes in the form "Xh XXm" from details_data
runtime_text <- details_data[grep("\\d+h", details_data)]
head(runtime_text)
## [1] "2h 28m" "2h 41m" "1h 53m" "3h 1m"  "2h 12m" "1h 59m"

Convert runtime_text from hours and minutes to minutes

# Convert runtime_text from hours and minutes to minutes
converted_runtimes <- sapply(strsplit(runtime_text, "h |m"), function(x) as.numeric(x[1]) * 60 + as.numeric(x[2]))

# Display the converted movie runtimes
head(converted_runtimes)
## [1] 148 161 113 181 132 119
length(converted_runtimes)
## [1] 25
summary(converted_runtimes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    95.0   116.0   126.0   127.2   135.0   181.0

Check to make sure movies match with runtimes with a temporary data frame

# Display the titles of movies with missing runtimes and their corresponding runtimes
df_1 <- data.frame(Title = title_data, Runtime = converted_runtimes)
head(df_1)
##                              Title Runtime
## 1                        Midsommar     148
## 2 Once Upon a Time... in Hollywood     161
## 3                    The Gentlemen     113
## 4                Avengers: Endgame     181
## 5                         Parasite     132
## 6                             1917     119

Scrape for Voting Information

# Use CSS selectors to scrape the number of votes
votes_labels <- html_nodes(webpage, css = "span.ipc-rating-star--voteCount")
vote_text <- html_text(votes_labels)
head(vote_text)
## [1] " (411K)" " (859K)" " (409K)" " (1.3M)" " (974K)" " (685K)"

Clean votes character strings

Remove parenthesis around vote_text

vote_text <- str_replace_all(vote_text, "[()]", "")
vote_text <- str_replace_all(vote_text, " ", "")
head(vote_text)
## [1] " 411K" " 859K" " 409K" " 1.3M" " 974K" " 685K"

Combine all the lists to form a data frame

# Display the movies with missing or invalid runtimes
df_movies <- data.frame(rank = rank_data, title = title_data, description = description_data, runtime = converted_runtimes, votes = vote_text)
head(df_movies)
##   rank                            title
## 1    1                        Midsommar
## 2    2 Once Upon a Time... in Hollywood
## 3    3                    The Gentlemen
## 4    4                Avengers: Endgame
## 5    5                         Parasite
## 6    6                             1917
##                                                                                                                                                                                                                                    description
## 1    A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
## 3                                                 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 4         After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
## 5                                                                                                  Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 6             April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
##   runtime votes
## 1     148  411K
## 2     161  859K
## 3     113  409K
## 4     181  1.3M
## 5     132  974K
## 6     119  685K

Convert M and K number of votes

M stands for millions K stands for thousands

df2 <- df_movies |>
  mutate(votes_thous = gsub("K", "", votes)) |>
  mutate(votes_thous = gsub("M", "000", votes_thous)) |>
  mutate(votes_thous = gsub("[.]", "", votes_thous))
head(df2)
##   rank                            title
## 1    1                        Midsommar
## 2    2 Once Upon a Time... in Hollywood
## 3    3                    The Gentlemen
## 4    4                Avengers: Endgame
## 5    5                         Parasite
## 6    6                             1917
##                                                                                                                                                                                                                                    description
## 1    A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
## 3                                                 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 4         After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
## 5                                                                                                  Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 6             April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
##   runtime votes votes_thous
## 1     148  411K         411
## 2     161  859K         859
## 3     113  409K         409
## 4     181  1.3M       13000
## 5     132  974K         974
## 6     119  685K         685
df2$votes_in_thous <- parse_number(df2$votes_thous)
head(df2)
##   rank                            title
## 1    1                        Midsommar
## 2    2 Once Upon a Time... in Hollywood
## 3    3                    The Gentlemen
## 4    4                Avengers: Endgame
## 5    5                         Parasite
## 6    6                             1917
##                                                                                                                                                                                                                                    description
## 1    A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2 As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt double Cliff Booth endeavor to achieve lasting success in Hollywood while meeting several colorful characters along the way.
## 3                                                 An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 4         After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
## 5                                                                                                  Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 6             April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
##   runtime votes votes_thous votes_in_thous
## 1     148  411K         411            411
## 2     161  859K         859            859
## 3     113  409K         409            409
## 4     181  1.3M       13000          13000
## 5     132  974K         974            974
## 6     119  685K         685            685

Problem 1: Based on the scraped 2019 IMDB movie data frame, create a histogram that shows runtime on the x-axis. Be sure to provide a title, axis label, and caption for the data source.

Alternatively you may create a scatterplot of runtime versus number of votes.

library(ggplot2)
ggplot(df2, aes(x = runtime)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(title = "Distribution of Movie Runtimes",
       x = "Runtime (minutes)",
       y = "Frequency") +
  theme_minimal()

Problem 2: Use the filter function to answer the following question?

Which movies had a runtime 100-150 minutes, in the top rated 5 movies. Be sure to state the rank and runtime for each movie.

filtered_movies <- df2 %>%
  filter(runtime >= 100 & runtime <= 150)
head(filtered_movies)
##   rank         title
## 1    1     Midsommar
## 2    3 The Gentlemen
## 3    5      Parasite
## 4    6          1917
## 5    7         Joker
## 6    8          Anna
##                                                                                                                                                                                                                                   description
## 1   A couple travels to Northern Europe to visit a rural hometown's fabled Swedish mid-summer festival. What begins as an idyllic retreat quickly devolves into an increasingly violent and bizarre competition at the hands of a pagan cult.
## 2                                                An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him.
## 3                                                                                                 Greed and class discrimination threaten the newly-formed symbiotic relationship between the wealthy Park family and the destitute Kim clan.
## 4            April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## 5 Arthur Fleck, a party clown and a failed stand-up comedian, leads an impoverished life with his ailing mother. However, when society shuns him and brands him as a freak, he decides to embrace the life of crime and chaos in Gotham City.
## 6                                                                    Beneath Anna Poliatova's striking beauty lies a secret that will unleash her indelible strength and skill to become one of the world's most feared government assassins.
##   runtime votes votes_thous votes_in_thous
## 1     148  411K         411            411
## 2     113  409K         409            409
## 3     132  974K         974            974
## 4     119  685K         685            685
## 5     122  1.5M       15000          15000
## 6     118   97K          97             97

Problem 3: In the runtime of 116-135 mins, which movies are from the lowest ranked 5 out of 25?

Again, you must use the filter function to get the exact movie which answers this question. Be sure to state the rank and runtime for each movie.

filtered_movies2 <- df2 %>%
  filter(runtime >= 116 & runtime <= 135) %>%
  arrange(runtime) %>%
  slice(1:5)
head(filtered_movies2)
##   rank          title
## 1   16             Us
## 2    8           Anna
## 3   22  The Lion King
## 4    6           1917
## 5    7          Joker
##                                                                                                                                                                                                                                   description
## 1                                                              Adelaide Wilson and her family are attacked by mysterious figures dressed in red. Upon closer inspection, the Wilsons realize that the intruders are exact lookalikes of them.
## 2                                                                    Beneath Anna Poliatova's striking beauty lies a secret that will unleash her indelible strength and skill to become one of the world's most feared government assassins.
## 3                                                                                                         After the murder of his father, a young lion prince flees his kingdom only to learn the true meaning of responsibility and bravery.
## 4            April 6th, 1917. As an infantry battalion assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap.
## 5 Arthur Fleck, a party clown and a failed stand-up comedian, leads an impoverished life with his ailing mother. However, when society shuns him and brands him as a freak, he decides to embrace the life of crime and chaos in Gotham City.
##   runtime votes votes_thous votes_in_thous
## 1     116  345K         345            345
## 2     118   97K          97             97
## 3     118  269K         269            269
## 4     119  685K         685            685
## 5     122  1.5M       15000          15000