Webscraping Assignment

Author

Shadeja Fuentes

Loading packages

library(rvest)

Warning: package 'rvest' was built under R version 4.2.3

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()

# IMDb search results: feature titles released in 2016, 100 per page
# (see the query-string parameters).
url <- "https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the page once; every scrape below reads from `webpage`.
webpage <- url %>% read_html()

Get rankings data

Use CSS selector to scrape the rankings on the website (.text-primary corresponds to the rankings).

# Select every node carrying the .text-primary class (the rank labels).
rank_data_html <- webpage %>% html_elements(".text-primary")

Convert the ranking data to text

# Extract the text content of each rank node and preview the first entries.
rank_data <- rank_data_html %>% html_text()
head(rank_data)

[1] "1." "2." "3." "4." "5." "6."

Convert ranking data to numerical value

# Strings such as "1." parse directly as numbers, so no cleaning is needed.
rank_data <- rank_data %>% as.numeric()
head(rank_data)

[1] 1 2 3 4 5 6

Get the data for titles, description, runtime, genre, rating, metascore, votes, Gross Earning (in millions), director, actor data and convert to text.

Titles data

# Titles: text of the <a> elements found under .lister-item-header.
title_data_html <- webpage %>% html_elements(".lister-item-header a")
title_data <- title_data_html %>% html_text()
head(title_data)

[1] "The Magnificent Seven"        "Me Before You"               
[3] "Rogue One: A Star Wars Story" "Hidden Figures"              
[5] "Suicide Squad"                "Sing"

Description data

# Descriptions: the .text-muted element immediately following each ratings bar.
description_data_html <- webpage %>% html_elements(".ratings-bar+ .text-muted")
description_data <- description_data_html %>% html_text()
head(description_data)

[1] "\nSeven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."                                                          
[2] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."                                                                                                                            
[3] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
[4] "\nThe story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."                                                                              
[5] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
[6] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."

Remove the newline character ("\n") from the description data

# Drop every newline character; a fixed (non-regex) match is sufficient here.
description_data <- gsub("\n", "", description_data, fixed = TRUE)
head(description_data)

[1] "Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."                                                          
[2] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."                                                                                                                            
[3] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
[4] "The story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."                                                                              
[5] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
[6] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."

Runtime data

# Runtimes: .runtime elements nested inside .text-muted, e.g. "132 min".
runtime_data_html <- webpage %>% html_elements(".text-muted .runtime")
runtime_data <- runtime_data_html %>% html_text()
head(runtime_data)

[1] "132 min" "106 min" "133 min" "127 min" "123 min" "108 min"

Remove “min” and convert to numerical value

# Strip the " min" suffix and convert what remains to a number in one step.
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))
head(runtime_data)

[1] 132 106 133 127 123 108

Genre data

# Genres: raw text of each .genre element (comma-separated, padded with
# whitespace as shown in the preview).
genre_data_html <- webpage %>% html_elements(".genre")
genre_data <- genre_data_html %>% html_text()
head(genre_data)

[1] "\nAction, Adventure, Western            "
[2] "\nDrama, Romance            "            
[3] "\nAction, Adventure, Sci-Fi            " 
[4] "\nBiography, Drama, History            " 
[5] "\nAction, Adventure, Fantasy            "
[6] "\nAnimation, Comedy, Family            "

Remove the newline character ("\n") and excess spaces, keep only the first genre of each movie, and convert the text to a factor.

# Clean the genre strings in a single pipeline:
#   1. remove every newline and space character,
#   2. cut everything from the first comma onwards (keeps only the first genre),
#   3. convert the result to a factor.
genre_data <- genre_data %>%
  gsub("[\n ]", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()
head(genre_data)

[1] Action    Drama     Action    Biography Action    Animation
Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

Ratings data

# IMDb ratings: the <strong> element inside .ratings-imdb-rating.
rating_data_html <- webpage %>% html_elements(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>% html_text()
head(rating_data)

[1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"

# Ratings arrive as text such as "6.8"; convert them to numbers.
rating_data <- rating_data %>% as.numeric()
head(rating_data)

[1] 6.8 7.4 7.8 7.8 5.9 7.1

Votes data

# Vote counts: the second child <span> of each .sort-num_votes-visible element.
votes_data_html <- webpage %>%
  html_elements(".sort-num_votes-visible span:nth-child(2)")
votes_data <- votes_data_html %>% html_text()
head(votes_data)

[1] "217,151" "263,304" "652,024" "238,315" "695,524" "176,666"

Remove comma from votes data and convert to numerical value

# Remove the thousands separators, then convert the digits to numbers.
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
head(votes_data)

[1] 217151 263304 652024 238315 695524 176666

Directors data

# Directors: the first <a> in the paragraph that follows a .text-muted element.
directors_data_html <- webpage %>% html_elements(".text-muted+ p a:nth-child(1)")
# Extract the names and store them as a factor.
directors_data <- directors_data_html %>%
  html_text() %>%
  as.factor()
head(directors_data)

[1] Antoine Fuqua  Thea Sharrock  Gareth Edwards Theodore Melfi David Ayer    
[6] Garth Jennings
99 Levels: Aisling Walsh Alessandro Carloni Alex Proyas ... Zack Snyder

Actors Data

# Lead actors: the <a> directly after the .ghost separator in each list item.
actors_data_html <- webpage %>% html_elements(".lister-item-content .ghost+ a")
# Extract the names and store them as a factor.
actors_data <- actors_data_html %>%
  html_text() %>%
  as.factor()
head(actors_data)

[1] Denzel Washington   Emilia Clarke       Felicity Jones     
[4] Taraji P. Henson    Will Smith          Matthew McConaughey
92 Levels: Adam Sandler Alexander Skarsgård Amy Adams ... Zoey Deutch

Metascore data

# Metascores: text of each .metascore element (padded with trailing spaces).
metascore_data_html <- webpage %>% html_elements(".metascore")
metascore_data <- metascore_data_html %>% html_text()
head(metascore_data)

[1] "54        " "51        " "65        " "74        " "40        "
[6] "59        "

Remove extra space in Metascore

# Strip all space characters, then count how many Metascores were found.
metascore_data <- gsub(" ", "", metascore_data, fixed = TRUE)
length(metascore_data)

[1] 96

Find Metascore data with missing values and replace with NAs. Scrape the ratings bar then extract the Metascore.

 ratings_bar_data <- webpage %>%
   html_elements(".ratings-bar") %>%
   html_text2()
# Each entry is the full text of one ratings bar, which may or may not include
# a "<score> Metascore" fragment.
head(ratings_bar_data)

[1] "6.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.8/10 X \n54 Metascore"
[2] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"
[3] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n65 Metascore"
[4] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n74 Metascore"
[5] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
[6] "7.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.1/10 X \n59 Metascore"

# Take the run of digits that directly precedes " Metascore" in each ratings
# bar. Matching \\d+ (rather than exactly two digits) also handles one- and
# three-digit scores; a fixed two-digit pattern would read "100" as "00".
# Bars without a Metascore yield NA, so the vector keeps one entry per bar.
metascore_data <- ratings_bar_data %>%
  str_extract("\\d+(?= Metascore)") %>%
  as.numeric()
length(metascore_data)

[1] 100

Gross earnings data

# Gross earnings via a direct selector: the <span> that follows a .text-muted
# sibling after the .ghost separator, e.g. "$93.43M".
gross_data_html <- webpage %>% html_elements(".ghost~ .text-muted+ span")
gross_data <- gross_data_html %>% html_text()
head(gross_data)

[1] "$93.43M"  "$56.25M"  "$532.18M" "$169.61M" "$325.10M" "$270.40M"

Find the missing gross earnings (automated). Earnings are part of the votes bar in the html, scrape the votes bar and extract earnings with a regular expression to get the NAs in context.

# Scrape the complete votes bar of every movie so that movies without a gross
# figure still contribute an entry (NA) and the vector stays aligned.
votes_bar_data <- webpage %>%
  html_elements(".sort-num_votes-visible") %>%
  html_text2()

# Extract the number that follows the "$" sign. The whole number is kept
# (cutting the string at a fixed width would drop the last decimal of values
# such as "532.18"); any thousands separators are removed before conversion.
gross_data <- votes_bar_data %>%
  str_extract("(?<=\\$)[0-9.,]+") %>%
  str_remove_all(",") %>%
  as.numeric()
length(gross_data)

[1] 100

Combine all lists to form movies_df data frame

# Assemble one row per movie. Every vector is expected to hold one entry per
# scraped movie, in page order.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Metascore = metascore_data,
  Director = directors_data,
  Actor = actors_data,
  Votes = votes_data,
  Rating = rating_data,
  Gross_Earning_Millions = gross_data
)

Plot 1

# Histogram of runtimes, stacked by genre. Written with ggplot() and
# geom_histogram() because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

Warning: `qplot()` was deprecated in ggplot2 3.4.0.

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

Based on the plot above, only three genres (Adventure, Action, and Drama) have movies with a runtime of 150 minutes or more. I filter the movies_df data frame to keep only those movies and store the result in filtered_movies. After filtering, I create a scatterplot of the relationship between runtime and IMDb score for movies with a runtime of 150 minutes or more. The movie with the longest runtime, 163 minutes, is an adventure film called American Honey.

# Keep movies running 150 minutes or longer (inclusive, matching the "150+"
# wording of the plot title).
filtered_movies <- filter(movies_df, Runtime >= 150)

# Plot the IMDb rating (the Rating column) against runtime so that the data
# shown agrees with the "IMDB score" axis label and title.
ggplot(filtered_movies, aes(x = Runtime, y = Rating, color = Genre)) +
  geom_point() +
  labs(
    title = "IMDB score for movies with a runtime of 150+ mins by genre",
    x = "Runtime (mins)",
    y = "IMDB score",
    color = "Genre"
  )

Plot 2

# Rating against runtime; point size encodes the vote count, colour the genre.
ggplot(movies_df, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre))

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

Based on the plot above I filtered the movies_df data to create a new data frame called filtered_movies2 by selecting only the rows of movies_df where the Runtime column is greater than 129 AND less than 160. The resulting filtered_movies2 data frame will only contain movies with a runtime between 129 and 160 minutes. Sort the filtered_movies2 data frame by decreasing number of votes and create a bar plot showing the number of votes for the top 10 movies. Captain America: Civil War had the most votes of any movie.

# Disable scientific notation before plotting so that the vote counts on the
# axis are printed as plain numbers (the option only affects output produced
# after it is set).
options(scipen = 999)

# Movies with a runtime greater than 129 and less than 160 minutes, sorted
# from most to fewest votes.
filtered_movies2 <- filter(movies_df, Runtime > 129, Runtime < 160)
sorted_movies <- arrange(filtered_movies2, desc(Votes))

# Horizontal bar chart of the ten most-voted movies. reorder() sorts the bars
# by vote count (a bare Title axis would sort them alphabetically), and
# geom_col() draws bars whose height is the value itself.
ggplot(head(sorted_movies, n = 10), aes(x = reorder(Title, Votes), y = Votes)) +
  geom_col(fill = "#3f7275") +
  coord_flip() +
  labs(title = "Top 10 movies by number of votes", x = "", y = "Number of votes")

Plot 3

# Gross earnings against runtime; point size encodes the rating, colour the
# genre. Movies without a gross figure (NA) are dropped by geom_point().
ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_Millions)) +
  geom_point(aes(size = Rating, colour = Genre))

Warning: Removed 11 rows containing missing values (`geom_point()`).

Question 3: Based on the above data, across all genres which genre has the highest average gross earnings in runtime 100 to 120.

Based on the plot above, I first filter movies_df for all movies with a runtime between 99 and 120 minutes, naming that dataset filtered_movies3. I then arrange the data in descending order by Gross_Earning_Millions, calling the arranged dataset filtered_movies4. Finally, I create a scatter plot showing the relationship between the runtime and gross earnings of these movies. Each point on the plot represents a movie, with the x-coordinate showing the movie’s runtime and the y-coordinate showing its gross earnings. The points are colored by the movie’s genre, with each genre represented by a different color. The plot can be used to identify any trends or patterns between the runtime and gross earnings of the movies. The adventure genre saw the highest average gross earnings. The Jungle Book earned $364 million, more than any other movie with a runtime between 99 and 120 minutes.

# Movies with a runtime strictly between 99 and 120 minutes.
filtered_movies3 <- movies_df %>%
  filter(Runtime > 99, Runtime < 120)

# Same rows, highest-grossing first; preview the top of the table.
filtered_movies4 <- filtered_movies3 %>%
  arrange(desc(Gross_Earning_Millions))
head(filtered_movies4)

  Rank           Title
1   51 The Jungle Book
2    8        Deadpool
3   12        Zootopia
4    6            Sing
5   11           Moana
6   38  Doctor Strange
                                                                                                                                                                                                                    Description
1                         After a threat from the tiger Shere Khan forces him to flee the jungle, a man-cub named Mowgli embarks on a journey of self discovery with the help of panther Bagheera and free-spirited bear Baloo.
2                                                                                         A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks.
3                                                                                             In a city of anthropomorphic animals, a rookie bunny cop and a cynical con artist fox must work together to uncover a conspiracy.
4 In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same.
5                                                    In Ancient Polynesia, when a terrible curse incurred by the Demigod Maui reaches Moana's island, she answers the Ocean's call to seek out the Demigod to set things right.
6                                                                                                    While on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts.
  Runtime     Genre Metascore         Director                Actor   Votes
1     106 Adventure        77      Jon Favreau           Neel Sethi  281816
2     108    Action        65       Tim Miller        Ryan Reynolds 1057631
3     108 Animation        78     Byron Howard     Ginnifer Goodwin  511267
4     108 Animation        59   Garth Jennings  Matthew McConaughey  176666
5     107 Animation        81     Ron Clements      Auli'i Cravalho  346797
6     115    Action        72 Scott Derrickson Benedict Cumberbatch  758454
  Rating Gross_Earning_Millions
1    7.4                  364.0
2    8.0                  363.0
3    8.0                  341.2
4    7.1                  270.4
5    7.6                  248.7
6    7.5                  232.6

# Gross earnings against runtime for the filtered movies, one point per movie,
# coloured by genre. Every component must be chained with `+`: labels, axis
# breaks and theme that are not joined to the ggplot() call are evaluated on
# their own and never applied to the plot.
ggplot(filtered_movies4, aes(x = Runtime, y = Gross_Earning_Millions)) +
  geom_point(aes(color = Genre)) +
  labs(
    title = "Gross earnings by runtime for movies with a runtime between 99 and 120 minutes",
    x = "Runtime",
    y = "Gross earnings (millions of USD)",
    color = "Genre"
  ) +
  # One axis break every 5 minutes across the observed runtime range.
  scale_x_continuous(
    breaks = seq(min(filtered_movies4$Runtime), max(filtered_movies4$Runtime), by = 5)
  ) +
  theme_dark()

Warning: Removed 3 rows containing missing values (`geom_point()`).