Install packages
#install.packages('rvest')
#Loading the rvest package
library('rvest')
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
Scrape the IMDB website
# IMDB search-results page: 100 feature films released in 2016
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download the page and parse it into an HTML document
webpage <- url %>% read_html()
Scrape the titles
# Select the link nodes inside each list-item header
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
# Extract the text of the selected nodes
title_data <- title_data_html %>% html_text()
# Print every scraped title
title_data
## [1] "Terrifier"
## [2] "Star Wars: Rogue One"
## [3] "Sing"
## [4] "Deadpool"
## [5] "Die Taschendiebin"
## [6] "Suicide Squad"
## [7] "Split"
## [8] "Don't Breathe"
## [9] "Hacksaw Ridge - Die Entscheidung"
## [10] "X-Men: Apocalypse"
## [11] "Arrival"
## [12] "La La Land"
## [13] "Conjuring 2"
## [14] "Ghostbusters"
## [15] "The Nice Guys"
## [16] "Vaiana - Das Paradies hat einen Haken"
## [17] "The Watcher"
## [18] "Train to Busan"
## [19] "Still"
## [20] "Hidden Figures"
## [21] "13 Hours: The Secret Soldiers of Benghazi"
## [22] "Nocturnal Animals"
## [23] "Die 5. Welle"
## [24] "Ein ganzes halbes Jahr"
## [25] "Doctor Strange"
## [26] "The First Avenger: Civil War"
## [27] "The Love Witch"
## [28] "Your Name."
## [29] "Zoomania"
## [30] "Stolz und Vorurteil & Zombies"
## [31] "Batman v Superman: Dawn of Justice"
## [32] "Phantastische Tierwesen und wo sie zu finden sind"
## [33] "Raw"
## [34] "Die Insel der besonderen Kinder"
## [35] "Passengers"
## [36] "10 Cloverfield Lane"
## [37] "Lights Out"
## [38] "Manchester by the Sea"
## [39] "The Autopsy of Jane Doe"
## [40] "Die glorreichen Sieben"
## [41] "Hell or High Water"
## [42] "Popstar: Never Stop Never Stopping"
## [43] "Captain Fantastic: Einmal Wildnis und zurück"
## [44] "The Choice - Bis zum letzten Tag"
## [45] "Allied: Vertraute Fremde"
## [46] "Moonlight"
## [47] "Ouija: Ursprung des Bösen"
## [48] "War Dogs"
## [49] "Below Her Mouth"
## [50] "Der unsichtbare Gast"
## [51] "A Cure for Wellness"
## [52] "Girl on the Train"
## [53] "The Neon Demon"
## [54] "The Shallows: Gefahr aus der Tiefe"
## [55] "Die versunkene Stadt Z"
## [56] "Sausage Paarty - Es geht um die Wurst"
## [57] "The Wailing: Die Besessenen"
## [58] "31"
## [59] "Warcraft: The Beginning"
## [60] "Bad Moms"
## [61] "Swiss Army Man"
## [62] "Legend of Tarzan"
## [63] "Star Trek Beyond"
## [64] "Central Intelligence"
## [65] "The Boy"
## [66] "The Accountant"
## [67] "Mashina lyubvi"
## [68] "Bastille Day"
## [69] "The Bad Batch"
## [70] "Dirty Grandpa"
## [71] "Lion: Der lange Weg nach Hause"
## [72] "Blair Witch"
## [73] "The Jungle Book"
## [74] "The Huntsman & the Ice Queen"
## [75] "Dangal: Die Hoffnung auf den großen Sieg"
## [76] "Inferno"
## [77] "Jason Bourne"
## [78] "The Founder"
## [79] "The Purge: Election Year"
## [80] "Assassin's Creed"
## [81] "Independence Day: Wiederkehr"
## [82] "Deepwater Horizon"
## [83] "The Great Wall"
## [84] "Silence"
## [85] "Die Bestimmung - Allegiant"
## [86] "Wo die wilden Menschen jagen"
## [87] "Gods of Egypt"
## [88] "Boston"
## [89] "Batman v Superman: Dawn of Justice Ultimate Edition"
## [90] "Hail, Caesar!"
## [91] "Der Spion und sein Bruder"
## [92] "How to Be Single"
## [93] "Sully"
## [94] "Everybody Wants Some!!"
## [95] "Jack Reacher: Kein Weg zurück"
## [96] "Before I Wake"
## [97] "Trolls"
## [98] "Koe no katachi"
## [99] "The Void"
## [100] "Pets"
Scrape description data and remove newline characters
# Select the muted text element that directly follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
# Extract the text of the selected nodes
description_data <- description_data_html %>% html_text()
# Preview the first few raw descriptions
head(description_data)
## [1] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [2] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [3] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "\nA wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [5] "\nA woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."
## [6] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
# Pre-processing: delete every newline character from the descriptions
description_data <- gsub(pattern = "\n", replacement = "", x = description_data)
# Preview the cleaned descriptions
head(description_data)
## [1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [2] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [3] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [5] "A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."
## [6] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
Scrape the movie runtime
# Select the runtime element inside each muted text line
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Extract the text of the selected nodes
runtime_data <- runtime_data_html %>% html_text()
# Preview the raw runtime strings
head(runtime_data)
## [1] "85 min" "133 min" "108 min" "108 min" "145 min" "123 min"
# Pre-processing: drop the " min" suffix, then convert the strings to numbers
runtime_data <- as.numeric(
  gsub(pattern = " min", replacement = "", x = runtime_data)
)
# Preview the numeric runtimes
head(runtime_data)
## [1] 85 133 108 108 145 123
Scrape the movie genre
# Select the genre element of each movie
genre_data_html <- webpage %>% html_nodes(".genre")
# Extract the text of the selected nodes
genre_data <- genre_data_html %>% html_text()
# Preview the raw genre strings
head(genre_data)
## [1] "\nHorror, Thriller "
## [2] "\nAction, Adventure, Sci-Fi "
## [3] "\nAnimation, Comedy, Family "
## [4] "\nAction, Adventure, Comedy "
## [5] "\nDrama, Romance, Thriller "
## [6] "\nAction, Adventure, Fantasy "
# Pre-processing, in order:
#   1. remove newline characters
#   2. remove all spaces
#   3. keep only the first genre by deleting everything from the first comma on
#   4. convert the resulting text to a factor
genre_data <- genre_data %>%
  gsub(pattern = "\n", replacement = "") %>%
  gsub(pattern = " ", replacement = "") %>%
  gsub(pattern = ",.*", replacement = "") %>%
  as.factor()
# Preview the cleaned genre factor
head(genre_data)
## [1] Horror Action Animation Action Drama Action
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
Scrape the IMDB rating
# Select the bold rating value inside each IMDB rating widget
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Extract the text of the selected nodes
rating_data <- rating_data_html %>% html_text()
# Preview the raw rating strings
head(rating_data)
## [1] "5.6" "7.8" "7.1" "8.0" "8.1" "5.9"
# Pre-processing: turn the rating strings into numbers
rating_data <- rating_data %>% as.numeric()
# Preview the numeric ratings
head(rating_data)
## [1] 5.6 7.8 7.1 8.0 8.1 5.9
Scrape the votes
# Select the second span child inside each votes bar
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Extract the text of the selected nodes
votes_data <- votes_data_html %>% html_text()
# Preview the raw vote counts
head(votes_data)
## [1] "25,977" "630,960" "169,937" "1,028,962" "146,512" "683,422"
# Pre-processing: strip the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(pattern = ",", replacement = "", x = votes_data))
# Preview the numeric vote counts
head(votes_data)
## [1] 25977 630960 169937 1028962 146512 683422
Scrape the directors and actors
# Select the first link in the paragraph that follows each muted text block
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Extract the text of the selected nodes
directors_data <- directors_data_html %>% html_text()
# Preview the director names
head(directors_data)
## [1] "Damien Leone" "Gareth Edwards" "Garth Jennings" "Tim Miller"
## [5] "Park Chan-wook" "David Ayer"
# Pre-processing: store the director names as a factor
directors_data <- directors_data %>% as.factor()
## Actors
# Select the link that directly follows the ".ghost" separator in each item
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Extract the text of the selected nodes
actors_data <- actors_data_html %>% html_text()
# Preview the actor names
head(actors_data)
## [1] "Jenna Kanell" "Felicity Jones" "Matthew McConaughey"
## [4] "Ryan Reynolds" "Kim Min-hee" "Will Smith"
Scrape gross earnings
Find the missing gross earnings (automated). Earnings are part of the
votes bar in the HTML, so scrape the votes bar and extract the earnings with a
regular expression; movies without earnings then stay in place as NAs.
# Scrape the whole votes bar of every movie and convert it to text
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))
# Preview the votes bar text
head(votes_bar_data)
## [1] "Votes: 25,977" "Votes: 630,960 | Gross: $532.18M"
## [3] "Votes: 169,937 | Gross: $270.40M" "Votes: 1,028,962 | Gross: $363.07M"
## [5] "Votes: 146,512 | Gross: $2.01M" "Votes: 683,422 | Gross: $325.10M"
# Extract the gross earnings (in millions of dollars) from the votes bar text.
# The capture group keeps only the digits and decimal point that follow "$",
# so the whole figure is preserved. The previous fixed-width substring() kept
# just five characters and turned "$532.18M" into 532.1.
# Entries with no "$" amount do not match and become NA, which keeps
# gross_data the same length as the other scraped vectors.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)")[, 2] %>%
  as.numeric()
length(gross_data)
## [1] 100
head(gross_data)
## [1] NA 532.18 270.40 363.07 2.01 325.10
Create dataframe
# Combine the scraped vectors into a single data frame.
# NOTE(review): rank_data and metascore_data are not created in the code shown
# above -- confirm they are defined before this point.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Show the column types and a preview of the values
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Terrifier" "Star Wars: Rogue One" "Sing" "Deadpool" ...
## $ Description : chr "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death St"| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man"| __truncated__ ...
## $ Runtime : num 85 133 108 108 145 123 117 88 139 144 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 8 1 3 1 7 1 8 6 4 1 ...
## $ Rating : num 5.6 7.8 7.1 8 8.1 5.9 7.3 7.1 8.1 6.9 ...
## $ Metascore : num NA 65 59 65 84 40 62 71 71 52 ...
## $ Votes : num 25977 630960 169937 1028962 146512 ...
## $ Gross_Earning_in_Mil: num NA 532.1 270.4 363 2.01 ...
## $ Director : Factor w/ 95 levels "Adam Wingard",..: 19 32 34 90 67 23 53 31 57 11 ...
## $ Actor : chr "Jenna Kanell" "Felicity Jones" "Matthew McConaughey" "Ryan Reynolds" ...
Question 1: Based on the above data, which movie from which genre
had the longest runtime?
# Overlaid histogram of runtimes in 5-minute bins, filled by genre
p1 <- ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(name = "Genre") +
  labs(title = "Top 100 Movies of 2016 Runtime by Genre")
# Render the histogram as an interactive plotly widget
ggplotly(p1)
# Title, genre and runtime of every movie, longest runtime first
highestRuntime <- arrange(
  select(movies_df, Title, Genre, Runtime),
  desc(Runtime)
)
head(highestRuntime)
## Title Genre Runtime
## 1 Batman v Superman: Dawn of Justice Ultimate Edition Action 182
## 2 Dangal: Die Hoffnung auf den großen Sieg Action 161
## 3 Silence Drama 161
## 4 The Wailing: Die Besessenen Drama 156
## 5 Batman v Superman: Dawn of Justice Action 152
## 6 The First Avenger: Civil War Action 147
Batman v Superman: Dawn of Justice Ultimate Edition from the Action
genre has the highest runtime of 182 minutes.
Question 2: Based on the above data, in the Runtime of 130-160 mins,
which genre has the highest votes?
# Scatter plot of rating against runtime; point size shows votes, colour shows
# genre. The `text` aesthetic is not used by ggplot2 itself (it emits an
# "unknown aesthetics" warning); it is supplied for the ggplotly() rendering.
# The label is built from the Title column of the plotted data rather than the
# global title_data vector, so each label stays attached to its own row even
# if movies_df is filtered or reordered.
p2 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(
    aes(size = Votes, col = Genre, text = paste("Movie Title:", Title)),
    alpha = 0.7
  ) +
  labs(title = "Top 100 Movies of 2016 Runtime by Ratings")
## Warning: Ignoring unknown aesthetics: text
# Render the scatter plot as an interactive plotly widget
ggplotly(p2)
# Movies with a runtime of 130 to 160 minutes (inclusive), most votes first
highestVotes <- movies_df %>%
  filter(Runtime >= 130, Runtime <= 160) %>%
  select(Title, Genre, Runtime, Votes) %>%
  arrange(desc(Votes))
head(highestVotes)
## Title Genre Runtime Votes
## 1 The First Avenger: Civil War Action 147 781401
## 2 Batman v Superman: Dawn of Justice Action 152 697251
## 3 Star Wars: Rogue One Action 133 630960
## 4 Hacksaw Ridge - Die Entscheidung Biography 139 521030
## 5 Phantastische Tierwesen und wo sie zu finden sind Adventure 132 474239
## 6 X-Men: Apocalypse Action 144 434600
The First Avenger: Civil War (the localized title of Captain America:
Civil War), in the Action genre, has the highest votes of 781,401 with a
runtime of 147 minutes.
Question 3: Based on the above data, which genre has the highest average
gross earnings among movies with a runtime of 100 to 120 minutes?
# Scatter plot of gross earnings against runtime; point size shows rating,
# colour shows genre. The y axis is limited to -10..600 (millions).
p3 <- ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  scale_y_continuous(
    name = "Gross Earnings in Millions",
    limits = c(-10, 600)
  ) +
  labs(title = "Top 100 Movies of 2016 Runtime by Gross Earnings in Millions")
# Render the scatter plot as an interactive plotly widget
ggplotly(p3)
# Mean gross earnings per genre for movies running 100 to 120 minutes
# (inclusive), ignoring movies with no earnings figure; highest mean first
highestEarnings <- movies_df %>%
  filter(
    Runtime >= 100,
    Runtime <= 120,
    !is.na(Gross_Earning_in_Mil)
  ) %>%
  group_by(Genre) %>%
  summarize(avgGrossEarn = mean(Gross_Earning_in_Mil)) %>%
  arrange(desc(avgGrossEarn))
head(highestEarnings)
## # A tibble: 6 × 2
## Genre avgGrossEarn
## <fct> <dbl>
## 1 Animation 216.
## 2 Adventure 185.
## 3 Action 87.8
## 4 Drama 55.2
## 5 Crime 51.1
## 6 Horror 46.8
The Animation genre has the highest average gross earnings with
$216.33 million.