Load Package
library(rvest)
Set the working directory
# Nothing in this script as shown reads or writes local files, so the working
# directory is a convenience only. Switch to the author's folder when it
# exists instead of aborting the whole script with a setwd() error on any
# machine that lacks this path.
assignments_dir <- "/Users/smhenderson/Desktop/DATA110/Assignments"
if (dir.exists(assignments_dir)) {
  setwd(assignments_dir)
}
Specify the URL of the website to be scraped
# IMDb title search: feature films released in 2016, 100 results per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
Reading the HTML code from the website
# Fetch the search-results page and parse it into an HTML document
webpage <- read_html(x = url)
Rankings
# Select the ranking nodes with a CSS selector
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Extract the text of the selected nodes
rank_data <- rank_data_html %>% html_text()
# Preview the raw ranking strings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# Pre-processing: turn the ranking strings (e.g. "1.") into numbers
rank_data <- as.numeric(rank_data)
# Preview the numeric rankings
head(rank_data)
## [1] 1 2 3 4 5 6
Titles
# Select the link inside each list-item header; its text is the movie title
title_data_html <- html_nodes(x = webpage, css = ".lister-item-header a")
# Extract the text of the selected nodes
title_data <- html_text(x = title_data_html)
# Preview the titles
head(title_data)
## [1] "Arrival" "Hacksaw Ridge"
## [3] "Terrifier" "Suicide Squad"
## [5] "Batman v Superman: Dawn of Justice" "Me Before You"
Descriptions
# Select the muted paragraph that directly follows each ratings bar; it holds
# the plot description
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
# Extract the text of the selected nodes
description_data <- description_data_html %>% html_text()
# Preview the raw descriptions (each still starts with a newline)
head(description_data)
## [1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
## [2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Pre-processing: strip every newline character from the descriptions
description_data <- gsub(pattern = "\n", replacement = "", x = description_data)
# Preview the cleaned descriptions
head(description_data)
## [1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
## [2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
Movie runtime
# Select the runtime nodes with a CSS selector
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Extract the text of the selected nodes
runtime_data <- runtime_data_html %>% html_text()
# Preview the raw runtime strings
head(runtime_data)
## [1] "116 min" "139 min" "85 min" "123 min" "151 min" "106 min"
# Pre-processing: drop the " min" suffix, then convert to numbers (minutes)
runtime_data <- as.numeric(gsub(" min", "", runtime_data))
# Preview the numeric runtimes
head(runtime_data)
## [1] 116 139 85 123 151 106
Movie genre
# Select the genre nodes with a CSS selector
genre_data_html <- html_nodes(x = webpage, css = ".genre")
# Extract the text of the selected nodes
genre_data <- html_text(x = genre_data_html)
# Preview the raw genre strings
head(genre_data)
## [1] "\nDrama, Mystery, Sci-Fi "
## [2] "\nBiography, Drama, History "
## [3] "\nHorror, Thriller "
## [4] "\nAction, Adventure, Fantasy "
## [5] "\nAction, Adventure, Sci-Fi "
## [6] "\nDrama, Romance "
# Pre-processing, in order: remove newlines, remove all spaces, keep only the
# first genre (everything from the first comma on is cut), convert to a factor
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()
# Preview the cleaned genre factor
head(genre_data)
## [1] Drama Biography Horror Action Action Drama
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
IMDB rating
# Select the IMDb rating nodes with a CSS selector
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Extract the text of the selected nodes
rating_data <- rating_data_html %>% html_text()
# Preview the raw rating strings
head(rating_data)
## [1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"
# Pre-processing: convert the rating strings to numbers
rating_data <- as.numeric(rating_data)
# Preview the numeric ratings
head(rating_data)
## [1] 7.9 8.1 5.6 5.9 6.4 7.4
Votes
# Select the second span of each votes bar; it holds the vote count
votes_data_html <- html_nodes(
  x = webpage,
  css = ".sort-num_votes-visible span:nth-child(2)"
)
# Extract the text of the selected nodes
votes_data <- html_text(x = votes_data_html)
# Preview the raw vote strings
head(votes_data)
## [1] "722,867" "553,396" "43,192" "701,331" "714,469" "268,236"
# Pre-processing: drop the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data))
# Preview the numeric vote counts
head(votes_data)
## [1] 722867 553396 43192 701331 714469 268236
Directors
# Select the first link of the paragraph that follows the description; in the
# recorded sample its text is the director's name
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Extract the text of the selected nodes
directors_data <- directors_data_html %>% html_text()
# Preview the directors
head(directors_data)
## [1] "Denis Villeneuve" "Mel Gibson" "Damien Leone" "David Ayer"
## [5] "Zack Snyder" "Thea Sharrock"
# Pre-processing: store the directors as a factor
directors_data <- as.factor(directors_data)
# Select the link right after the ".ghost" separator in each list item; in the
# recorded sample its text is the first-billed actor
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Extract the text of the selected nodes
actors_data <- actors_data_html %>% html_text()
# Preview the actors
head(actors_data)
## [1] "Amy Adams" "Andrew Garfield" "Jenna Kanell" "Will Smith"
## [5] "Ben Affleck" "Emilia Clarke"
# Pre-processing: store the actors as a factor
actors_data <- as.factor(actors_data)
Find the missing gross earnings (automated). Earnings are part of the
votes bar in the HTML, so scrape the whole votes bar and extract the earnings
with a regular expression; movies without earnings then stay in place as NAs.
# Scrape the whole votes bar of every movie and convert it to text. As the
# recorded sample below shows, some bars have no "Gross" part, so earnings are
# parsed bar by bar to keep one value (possibly NA) per movie.
votes_bar_data <- html_nodes(webpage, '.sort-num_votes-visible') %>%
  html_text2()
head(votes_bar_data) # look at the votes bar data
## [1] "Votes: 722,867 | Gross: $100.55M" "Votes: 553,396 | Gross: $67.21M"
## [3] "Votes: 43,192" "Votes: 701,331 | Gross: $325.10M"
## [5] "Votes: 714,469 | Gross: $330.36M" "Votes: 268,236 | Gross: $56.25M"
# Capture the figure between "$" and "M". Base-R regex functions are used
# because stringr is not attached at this point of the script, and the capture
# group keeps every digit (a fixed-width substring cut "$330.36" to 330.3).
gross_pattern <- "\\$([0-9.,]+)M"
gross_matches <- regmatches(
  votes_bar_data,
  regexec(gross_pattern, votes_bar_data)
)
# Each match is c(full match, captured figure); no match gives character(0)
gross_data <- vapply(
  gross_matches,
  function(m) if (length(m) == 2L) m[[2L]] else NA_character_,
  character(1)
)
# Drop any thousands separators, then convert; values are millions of dollars
gross_data <- as.numeric(gsub(",", "", gross_data))
length(gross_data)
## [1] 100
Create a dataframe and inspect its structure
# Combine the scraped vectors into one data frame, one column per attribute.
# NOTE(review): metascore_data is not created anywhere in the script as shown;
# presumably it comes from a chunk that is not visible here - confirm.
# NOTE(review): the recorded structure reports 200 rows while gross_data has
# length 100, so at least one input presumably has length 200 and the shorter
# vectors were recycled by data.frame() - TODO confirm which one.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Inspect the structure of the data frame
str(movies_df)
## 'data.frame': 200 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
## $ Description : chr "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
## $ Runtime : num 116 139 85 123 151 106 108 111 128 107 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
## $ Rating : num 7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
## $ Metascore : num 81 71 NA 40 44 51 65 26 94 81 ...
## $ Votes : num 722867 553396 43192 701331 714469 ...
## $ Gross_Earning_in_Mil: num 100.5 67.2 NA 325.1 330.3 ...
## $ Director : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
## $ Actor : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...
Analyzing scraped data from the web
library('ggplot2')
# Histogram of movie runtimes in 30 bins, bars filled by genre.
# Written with ggplot() + geom_histogram() because qplot() is deprecated since
# ggplot2 3.4.0; this is the plot qplot() produced for a single numeric x.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

Which movie from which Genre had the longest runtime?
#Load package and prepare data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Ten longest movies: sort by runtime, longest first, and keep the first 10 rows
long_runtime <- head(arrange(movies_df, desc(Runtime)), 10)
# Bar chart of the movies with the longest runtimes: one bar per title
# (wrapped at 15 characters so labels fit), filled by genre
ggplot(
  data = long_runtime,
  mapping = aes(x = str_wrap(Title, width = 15), y = Runtime, fill = Genre)
) +
  geom_bar(stat = "identity") +
  scale_fill_discrete(name = "Genre") +
  labs(
    title = "Longest Runtime by Movie and Genre",
    x = "Movies",
    y = "Runtime"
  ) +
  theme_bw() +
  # Overrides must come after theme_bw(): no grid, centred title, small x labels
  theme(
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 8, hjust = 0.5, vjust = 1)
  )

# Rating against runtime; point size shows votes, point colour shows genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

Among movies with a runtime of 130-160 minutes, which genre has the
most votes?
# Total votes per genre and runtime for movies running 130-160 minutes,
# largest total first.
# `.groups` is an argument of summarize(), not group_by(): passed to
# group_by() it only added a constant grouping column named ".groups" and left
# the result grouped by Genre and Runtime. Given to summarize(), it returns an
# ungrouped table without the stray column and without the regrouping message.
high_votes <- movies_df %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  group_by(Genre, Runtime) %>%
  summarize(vote_sum = sum(Votes), .groups = "drop") %>%
  arrange(desc(vote_sum))
# Scatterplot of summed votes against runtime (130-160 minutes); point size
# shows the vote total, point colour shows the genre
ggplot(data = high_votes, mapping = aes(x = Runtime, y = vote_sum)) +
  geom_point(mapping = aes(size = vote_sum, colour = Genre)) +
  labs(
    title = "Genre with the Highest Votes in the Runtime of 130-160 Mins",
    x = "Runtime",
    y = "Votes"
  ) +
  theme_minimal() +
  # Overrides must come after theme_minimal(): centred title, no grid lines,
  # no legend key background
  theme(
    legend.key = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# A large scipen penalty makes R prefer fixed over scientific notation
options(scipen = 999)
# Gross earnings (millions) against runtime; point size shows rating, point
# colour shows genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Rating, colour = Genre))
## Warning: Removed 18 rows containing missing values (`geom_point()`).

Which genre has the highest average gross earnings among movies with a
runtime of 100 to 120 minutes?
# Average gross earnings (millions) per genre and runtime for movies running
# 100-120 minutes, highest average first.
# Movies without gross data are dropped before averaging; otherwise a single
# NA turns the mean of its whole genre/runtime group into NA.
# `.groups` is an argument of summarize(), not group_by(): passed to
# group_by() it only added a constant grouping column named ".groups" and left
# the result grouped. Given to summarize(), it returns an ungrouped table.
high_gross <- movies_df %>%
  filter(
    Runtime >= 100 & Runtime <= 120,
    !is.na(Gross_Earning_in_Mil)
  ) %>%
  group_by(Genre, Runtime) %>%
  summarize(avg_gross = mean(Gross_Earning_in_Mil), .groups = "drop") %>%
  arrange(desc(avg_gross))
# Scatterplot of average gross earnings against runtime (100-120 minutes);
# point size shows the average, point colour shows the genre
ggplot(data = high_gross, mapping = aes(x = Runtime, y = avg_gross)) +
  geom_point(mapping = aes(size = avg_gross, colour = Genre)) +
  labs(
    title = "Genres with the Highest Average Gross Earnings by Runtime",
    x = "Runtime",
    y = "Avg Gross Earnings"
  ) +
  theme_minimal() +
  # Overrides must come after theme_minimal(): centred title, no grid lines,
  # no legend key background
  theme(
    legend.key = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
## Warning: Removed 1 rows containing missing values (`geom_point()`).

# A large scipen penalty makes R prefer fixed over scientific notation.
# The same option is already set before the gross-earnings scatterplot earlier
# in this script, so this call does not change anything at this point.
options(scipen = 999)