# IMDb search-results URL; its query string asks for 100 feature films
# released in 2016.
url <- "https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page once; the selectors below all query `webpage`.
webpage <- url %>% read_html()
We’ll be scraping the following data from this website.
Rank: numeric, the rank of the film from 1 to 100 on the list of 100 most popular feature films released in 2016
Title: character, the title of the feature film
Description: character, the description of the feature film
Runtime: numeric, the duration of the feature film
Genre: factor, the genre of the feature film. In case of multiple genres, take only the first
Rating: numeric, the IMDb rating of the feature film
Metascore: numeric, the metascore on IMDb website for the feature film
Votes: numeric, votes cast in favor of the feature film
Gross_Earning_in_Mil: numeric, the gross earnings of the feature film in millions
Director: factor, the main director of the feature film. In case of multiple directors, take only the first
Actor: factor, the main actor in the feature film. In case of multiple actors, take only the first
Use the selector gadget to get the specific CSS selector that encloses the rankings. Copy the corresponding CSS selector in the bottom center and paste it in the R code below to get all the rankings.
# Select the nodes matched by the ".text-primary" CSS selector (the rankings).
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Extract the text of the selected nodes as a character vector.
rank_data <- rank_data_html %>% html_text()
# Preview the first six scraped values.
head(rank_data, n = 6)
## [1] "1." "2." "3." "4." "5." "6."
Convert the class of rankings to numeric.
# Coerce the scraped rank labels (e.g. "1.") from character to numeric.
rank_data <- rank_data %>% as.numeric()
head(rank_data, n = 6)
## [1] 1 2 3 4 5 6
# Title: select the link inside each list item's header and keep its text.
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()
head(title_data, n = 6)
## [1] "Moana"
## [2] "Moonlight"
## [3] "Suicide Squad"
## [4] "Rogue One: A Star Wars Story"
## [5] "Miss Peregrine's Home for Peculiar Children"
## [6] "La La Land"
# Description: select the muted-text node that directly follows each ratings
# bar and keep its raw text (still carrying leading whitespace at this point).
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
description_data <- description_data_html %>% html_text()
head(description_data, n = 6)
## [1] "\n In Ancient Polynesia, when a terrible curse incurred by the Demigod Maui reaches Moana's island, she answers the Ocean's call to seek out the Demigod to set things right."
## [2] "\n A young African-American man grapples with his identity and sexuality while experiencing the everyday struggles of childhood, adolescence, and burgeoning adulthood."
## [3] "\n A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [4] "\n The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the Death Star plans."
## [5] "\n When Jacob (Asa Butterfield) discovers clues to a mystery that stretches across time, he finds Miss Peregrine's Home for Peculiar Children. But the danger deepens after he gets to know the residents and learns about their special powers."
## [6] "\n While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future."
Remove the newline characters (\n) and the extra spaces.
# Strip every "newline followed by a space" sequence from the descriptions.
description_data <- gsub(
  pattern = "\n ",
  replacement = "",
  x = description_data
)
head(description_data, n = 6)
## [1] "In Ancient Polynesia, when a terrible curse incurred by the Demigod Maui reaches Moana's island, she answers the Ocean's call to seek out the Demigod to set things right."
## [2] "A young African-American man grapples with his identity and sexuality while experiencing the everyday struggles of childhood, adolescence, and burgeoning adulthood."
## [3] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [4] "The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the Death Star plans."
## [5] "When Jacob (Asa Butterfield) discovers clues to a mystery that stretches across time, he finds Miss Peregrine's Home for Peculiar Children. But the danger deepens after he gets to know the residents and learns about their special powers."
## [6] "While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future."
# Runtime: select the ".runtime" nodes and keep their text (e.g. "107 min").
runtime_data_html <- webpage %>% html_nodes(".runtime")
runtime_data <- runtime_data_html %>% html_text()
head(runtime_data, n = 6)
## [1] "107 min" "111 min" "123 min" "133 min" "127 min" "128 min"
Remove the " min" suffix and convert the runtimes to numeric.
# Drop the " min" unit suffix, then convert the remaining digits to numeric.
runtime_data <- runtime_data %>%
  gsub(" min", "", .) %>%
  as.numeric()
head(runtime_data, n = 6)
## [1] 107 111 123 133 127 128
# Genre: select the ".genre" nodes and keep their raw text, which is a
# comma-separated list wrapped in a leading newline and trailing spaces.
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text()
head(genre_data, n = 6)
## [1] "\nAnimation, Adventure, Comedy "
## [2] "\nDrama "
## [3] "\nAction, Adventure, Fantasy "
## [4] "\nAction, Adventure, Sci-Fi "
## [5] "\nAdventure, Drama, Family "
## [6] "\nComedy, Drama, Music "
Remove the newline characters (\n) and the spaces, and take only the first genre of each movie.
# Normalise the scraped genre strings and reduce each one to its first genre.
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%   # drop the newline characters
  gsub(" ", "", .) %>%    # drop every space
  gsub(",.*", "", .) %>%  # cut everything from the first comma onwards
  as.factor()             # store the result as a factor
head(genre_data, n = 6)
## [1] Animation Drama Action Action Adventure Comedy
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Rating: select the <strong> element inside each IMDb rating widget and keep
# its text.
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>% html_text()
head(rating_data, n = 6)
## [1] "7.6" "7.4" "6.0" "7.8" "6.7" "8.0"
# The ratings arrive as character strings; convert them to numeric.
rating_data <- rating_data %>% as.numeric()
head(rating_data, n = 6)
## [1] 7.6 7.4 6.0 7.8 6.7 8.0
# Votes: select the second <span> child in each votes row and keep its text
# (vote counts formatted with thousands separators).
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
votes_data <- votes_data_html %>% html_text()
head(votes_data, n = 6)
## [1] "254,991" "258,649" "580,808" "532,968" "150,557" "480,760"
Remove commas.
# Strip the thousands separators, then convert the vote counts to numeric.
votes_data <- votes_data %>%
  gsub(",", "", .) %>%
  as.numeric()
head(votes_data, n = 6)
## [1] 254991 258649 580808 532968 150557 480760
# Director: select the first link of the paragraph that follows the muted-text
# node of each entry and keep its text.
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")
directors_data <- directors_data_html %>% html_text()
head(directors_data, n = 6)
## [1] "Ron Clements" "Barry Jenkins" "David Ayer" "Gareth Edwards"
## [5] "Tim Burton" "Damien Chazelle"
# Keep only the first director (drop everything from the first comma onwards)
# and store the names as a factor.
directors_data <- directors_data %>%
  gsub(",.*", "", .) %>%
  as.factor()
head(directors_data, n = 6)
## [1] Ron Clements Barry Jenkins David Ayer Gareth Edwards
## [5] Tim Burton Damien Chazelle
## 98 Levels: Alex Proyas Ana Lily Amirpour André Øvredal ... Zack Snyder
# Actor: select the link that directly follows the ".ghost" separator inside
# each list item's content and keep its text.
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")
actors_data <- actors_data_html %>% html_text()
head(actors_data, n = 6)
## [1] "Auli'i Cravalho" "Mahershala Ali" "Will Smith" "Felicity Jones"
## [5] "Eva Green" "Ryan Gosling"
# Keep only the first actor (drop everything from the first comma onwards)
# and store the names as a factor.
actors_data <- actors_data %>%
  gsub(",.*", "", .) %>%
  as.factor()
head(actors_data, n = 6)
## [1] Auli'i Cravalho Mahershala Ali Will Smith Felicity Jones
## [5] Eva Green Ryan Gosling
## 92 Levels: Aamir Khan Adam Driver Adam Sandler ... Zoey Deutch
# Metascore: select the ".metascore" nodes and keep their text, which carries
# trailing spaces.
metascore_data_html <- webpage %>% html_nodes(".metascore")
metascore_data <- metascore_data_html %>% html_text()
head(metascore_data, n = 6)
## [1] "81 " "99 " "40 " "65 " "57 "
## [6] "94 "
# Remove every space so that only the digits remain.
metascore_data <- gsub(pattern = " ", replacement = "", x = metascore_data)
head(metascore_data, n = 6)
## [1] "81" "99" "40" "65" "57" "94"
# Count the scraped scores; films without a Metascore yield no node at all.
length(metascore_data)
## [1] 98
Since we are dealing with 100 movies, there are 2 movies that don’t have the corresponding Metascore fields. We simply add NAs to those two entries using a for loop. After a visual inspection, we find that the Metascore is missing for movies 22 and 80.
# Insert a missing value at every rank that has no Metascore on the page
# (ranks found by visual inspection). The ranks must be listed in ascending
# order: each insertion shifts the later entries one place to the right, so
# that every NA ends up exactly at its own rank.
for (i in c(22, 80)) {
  # append(..., after = i - 1) places the new element at position i. Unlike
  # manually splitting the vector with 1:(i - 1) and i:length(x), it keeps the
  # vector atomic and also works when i is 1 or one past the current end.
  metascore_data <- append(metascore_data, NA_character_, after = i - 1)
}
# Convert the character scores to numeric.
metascore_data <- as.numeric(metascore_data)
length(metascore_data)
## [1] 100
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 47.25 62.00 60.19 73.50 99.00 2
# Gross earnings: select the <span> that follows the muted label coming after
# the ".ghost" separator, and keep its text (e.g. "$248.76M").
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
gross_data <- gross_data_html %>% html_text()
head(gross_data, n = 6)
## [1] "$248.76M" "$27.85M" "$325.10M" "$532.18M" "$87.24M" "$151.10M"
Remove $ and M.
# Strip the currency symbol and the "M" (millions) suffix, then convert the
# remaining figures to numeric.
gross_data <- gross_data %>%
  gsub("\\$", "", .) %>%
  gsub("M", "", .) %>%
  as.numeric()
head(gross_data, n = 6)
## [1] 248.76 27.85 325.10 532.18 87.24 151.10
# Count the scraped figures; films without gross earnings yield no node.
length(gross_data)
## [1] 90
Since we are dealing with 100 movies, there are 10 movies that don’t have the corresponding gross earning fields. We simply add NAs to those entries. After a visual inspection, we find that the Gross Earning is missing for movies 22, 48, 52, 63, 72, 84, 91, 93, 94 and 100.
# Insert a missing value at every rank that has no gross earnings on the page
# (ranks found by visual inspection). The ranks must be listed in ascending
# order: each insertion shifts the later entries one place to the right, so
# that every NA ends up exactly at its own rank.
for (i in c(22, 48, 52, 63, 72, 84, 91, 93, 94, 100)) {
  # append(..., after = i - 1) places the new element at position i. It also
  # handles the last rank (100), which a manual split with i:length(x) cannot
  # do because that sequence runs backwards once i exceeds the length.
  gross_data <- append(gross_data, NA, after = i - 1)
}
# Make sure the result is a plain numeric vector.
gross_data <- as.numeric(gross_data)
length(gross_data)
## [1] 100
summary(gross_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.18 27.11 58.17 99.60 126.25 532.18 10
Now we have successfully scraped all the 11 features for the 100 most popular feature films released in 2016. Let’s combine them to create a dataframe and inspect its basic structure.
# Assemble the scraped vectors into one data frame, one column per feature.
top_movies <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Print a compact overview of the columns and their types.
top_movies %>% glimpse()
## Rows: 100
## Columns: 11
## $ Rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ Title <chr> "Moana", "Moonlight", "Suicide Squad", "Rogue ...
## $ Description <chr> "In Ancient Polynesia, when a terrible curse i...
## $ Runtime <dbl> 107, 111, 123, 133, 127, 128, 151, 108, 116, 1...
## $ Genre <fct> Animation, Drama, Action, Action, Adventure, C...
## $ Rating <dbl> 7.6, 7.4, 6.0, 7.8, 6.7, 8.0, 6.5, 8.0, 7.9, 7...
## $ Metascore <dbl> 81, 99, 40, 65, 57, 94, 44, 78, 81, 70, 71, 72...
## $ Votes <dbl> 254991, 258649, 580808, 532968, 150557, 480760...
## $ Gross_Earning_in_Mil <dbl> 248.76, 27.85, 325.10, 532.18, 87.24, 151.10, ...
## $ Director <fct> Ron Clements, Barry Jenkins, David Ayer, Garet...
## $ Actor <fct> Auli'i Cravalho, Mahershala Ali, Will Smith, F...
# Histogram of runtimes (30 bins) with the bars filled by Genre.
ggplot(top_movies, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30) +
  # Axis ticks every 10 minutes from 80 to 170.
  scale_x_continuous(breaks = seq(from = 80, to = 170, by = 10)) +
  labs(
    title = "Histogram of Runtime Stacked up by Genre",
    x = "Runtime (min)",
    y = "Frequency"
  ) +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
Question 1: Based on the above data, which movie from which Genre had the longest runtime? Judging from the histogram, it is a Drama with a runtime between 162.5 and 165 minutes. Let’s identify the movie and its exact runtime.
# Row index of the film with the longest runtime.
ind_max_runtime <- which.max(top_movies[["Runtime"]])
# Its runtime, title and genre, in that order.
top_movies[["Runtime"]][ind_max_runtime]
## [1] 163
top_movies[["Title"]][ind_max_runtime]
## [1] "American Honey"
top_movies[["Genre"]][ind_max_runtime]
## [1] Drama
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Scatterplot of Rating against Runtime; point colour encodes Genre and point
# size encodes the number of Votes.
ggplot(top_movies, aes(x = Runtime, y = Rating, color = Genre, size = Votes)) +
  geom_point() +
  # Axis ticks every 10 minutes from 80 to 170.
  scale_x_continuous(breaks = seq(from = 80, to = 170, by = 10)) +
  labs(
    title = "Scatterplot of Runtime vs Rating",
    x = "Runtime (min)",
    y = "Rating"
  ) +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes? Action. Let’s check.
# Keep the films whose runtime lies between 130 and 160 minutes (inclusive).
movies_run_130_160 <- filter(top_movies, Runtime >= 130, Runtime <= 160)
# Row index, within that subset, of the film with the most votes.
ind_max_votes <- which.max(movies_run_130_160[["Votes"]])
# Its vote count, title and genre, in that order.
movies_run_130_160[["Votes"]][ind_max_votes]
## [1] 637040
movies_run_130_160[["Title"]][ind_max_votes]
## [1] "Captain America: Civil War"
movies_run_130_160[["Genre"]][ind_max_votes]
## [1] Action
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Scatterplot of gross earnings against Runtime; point colour encodes Genre
# and point size encodes the Rating.
ggplot(
  top_movies,
  aes(x = Runtime, y = Gross_Earning_in_Mil, color = Genre, size = Rating)
) +
  geom_point() +
  # Axis ticks every 10 minutes from 80 to 170.
  scale_x_continuous(breaks = seq(from = 80, to = 170, by = 10)) +
  labs(
    title = "Scatterplot of Runtime vs Gross Earning",
    x = "Runtime (min)",
    y = "Gross Earning (millions)"
  ) +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
Question 3: Based on the above data, across all genres which genre has the highest average gross earnings in runtime 100 to 120? Adventure. Let’s check.
# Keep the films whose runtime lies between 100 and 120 minutes (inclusive).
movies_run_100_120 <- top_movies %>%
  filter(Runtime >= 100 & Runtime <= 120)
# Single highest-grossing film in that window; which.max() skips the films
# whose gross earnings are NA.
ind_max_gross <- which.max(movies_run_100_120$Gross_Earning_in_Mil)
movies_run_100_120$Gross_Earning_in_Mil[ind_max_gross]
## [1] 364
movies_run_100_120$Title[ind_max_gross]
## [1] "The Jungle Book"
movies_run_100_120$Genre[ind_max_gross]
## [1] Adventure
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# The lookup above only identifies the top single film. Question 3 asks about
# the *average* gross earnings per genre, so also compute the mean per genre
# (ignoring films with missing earnings) and list the genres from the highest
# to the lowest average; the first row answers the question.
avg_gross_by_genre <- movies_run_100_120 %>%
  group_by(Genre) %>%
  summarise(Avg_Gross_in_Mil = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  arrange(desc(Avg_Gross_in_Mil))
avg_gross_by_genre