Install packages

#install.packages('rvest')
#Loading the rvest package
library('rvest')
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Scrape the IMDB website

# IMDb search-results page: 100 feature films with a 2016 release date
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the page into an HTML document
webpage <- url %>% read_html()

Scrape ranking information

# Select the rank nodes by their CSS class
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Pull the text out of each rank node
rank_data <- rank_data_html %>% html_text()

# Preview the first few scraped ranks
rank_data %>% head()
## [1] "1." "2." "3." "4." "5." "6."

Clean data by adjusting the format from character string to numeric

# Turn the rank strings (e.g. "1.") into numbers
rank_data <- rank_data %>% as.numeric()

# Preview the converted ranks and count how many were scraped
rank_data %>% head()
## [1] 1 2 3 4 5 6
rank_data %>% length()
## [1] 100

Scrape the titles

# Select the link inside each list item's header; its text is the movie title
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Pull the title text out of each link node
title_data <- title_data_html %>% html_text()

# Print every scraped title
print(title_data)
##   [1] "Terrifier"                                          
##   [2] "Star Wars: Rogue One"                               
##   [3] "Sing"                                               
##   [4] "Deadpool"                                           
##   [5] "Die Taschendiebin"                                  
##   [6] "Suicide Squad"                                      
##   [7] "Split"                                              
##   [8] "Don't Breathe"                                      
##   [9] "Hacksaw Ridge - Die Entscheidung"                   
##  [10] "X-Men: Apocalypse"                                  
##  [11] "Arrival"                                            
##  [12] "La La Land"                                         
##  [13] "Conjuring 2"                                        
##  [14] "Ghostbusters"                                       
##  [15] "The Nice Guys"                                      
##  [16] "Vaiana - Das Paradies hat einen Haken"              
##  [17] "The Watcher"                                        
##  [18] "Train to Busan"                                     
##  [19] "Still"                                              
##  [20] "Hidden Figures"                                     
##  [21] "13 Hours: The Secret Soldiers of Benghazi"          
##  [22] "Nocturnal Animals"                                  
##  [23] "Die 5. Welle"                                       
##  [24] "Ein ganzes halbes Jahr"                             
##  [25] "Doctor Strange"                                     
##  [26] "The First Avenger: Civil War"                       
##  [27] "The Love Witch"                                     
##  [28] "Your Name."                                         
##  [29] "Zoomania"                                           
##  [30] "Stolz und Vorurteil & Zombies"                      
##  [31] "Batman v Superman: Dawn of Justice"                 
##  [32] "Phantastische Tierwesen und wo sie zu finden sind"  
##  [33] "Raw"                                                
##  [34] "Die Insel der besonderen Kinder"                    
##  [35] "Passengers"                                         
##  [36] "10 Cloverfield Lane"                                
##  [37] "Lights Out"                                         
##  [38] "Manchester by the Sea"                              
##  [39] "The Autopsy of Jane Doe"                            
##  [40] "Die glorreichen Sieben"                             
##  [41] "Hell or High Water"                                 
##  [42] "Popstar: Never Stop Never Stopping"                 
##  [43] "Captain Fantastic: Einmal Wildnis und zurück"       
##  [44] "The Choice - Bis zum letzten Tag"                   
##  [45] "Allied: Vertraute Fremde"                           
##  [46] "Moonlight"                                          
##  [47] "Ouija: Ursprung des Bösen"                          
##  [48] "War Dogs"                                           
##  [49] "Below Her Mouth"                                    
##  [50] "Der unsichtbare Gast"                               
##  [51] "A Cure for Wellness"                                
##  [52] "Girl on the Train"                                  
##  [53] "The Neon Demon"                                     
##  [54] "The Shallows: Gefahr aus der Tiefe"                 
##  [55] "Die versunkene Stadt Z"                             
##  [56] "Sausage Paarty - Es geht um die Wurst"              
##  [57] "The Wailing: Die Besessenen"                        
##  [58] "31"                                                 
##  [59] "Warcraft: The Beginning"                            
##  [60] "Bad Moms"                                           
##  [61] "Swiss Army Man"                                     
##  [62] "Legend of Tarzan"                                   
##  [63] "Star Trek Beyond"                                   
##  [64] "Central Intelligence"                               
##  [65] "The Boy"                                            
##  [66] "The Accountant"                                     
##  [67] "Mashina lyubvi"                                     
##  [68] "Bastille Day"                                       
##  [69] "The Bad Batch"                                      
##  [70] "Dirty Grandpa"                                      
##  [71] "Lion: Der lange Weg nach Hause"                     
##  [72] "Blair Witch"                                        
##  [73] "The Jungle Book"                                    
##  [74] "The Huntsman & the Ice Queen"                       
##  [75] "Dangal: Die Hoffnung auf den großen Sieg"           
##  [76] "Inferno"                                            
##  [77] "Jason Bourne"                                       
##  [78] "The Founder"                                        
##  [79] "The Purge: Election Year"                           
##  [80] "Assassin's Creed"                                   
##  [81] "Independence Day: Wiederkehr"                       
##  [82] "Deepwater Horizon"                                  
##  [83] "The Great Wall"                                     
##  [84] "Silence"                                            
##  [85] "Die Bestimmung - Allegiant"                         
##  [86] "Wo die wilden Menschen jagen"                       
##  [87] "Gods of Egypt"                                      
##  [88] "Boston"                                             
##  [89] "Batman v Superman: Dawn of Justice Ultimate Edition"
##  [90] "Hail, Caesar!"                                      
##  [91] "Der Spion und sein Bruder"                          
##  [92] "How to Be Single"                                   
##  [93] "Sully"                                              
##  [94] "Everybody Wants Some!!"                             
##  [95] "Jack Reacher: Kein Weg zurück"                      
##  [96] "Before I Wake"                                      
##  [97] "Trolls"                                             
##  [98] "Koe no katachi"                                     
##  [99] "The Void"                                           
## [100] "Pets"

Scrape description data and remove newline characters

# Select the muted paragraph that directly follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Pull the description text out of each node
description_data <- description_data_html %>% html_text()

# Preview the first few raw descriptions
description_data %>% head()
## [1] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                                 
## [2] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [3] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "\nA wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [5] "\nA woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."                                                                                                               
## [6] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
# Strip the newline character that each scraped description starts with
description_data <- gsub("\n", "", description_data, fixed = TRUE)

# Preview the cleaned descriptions
description_data %>% head()
## [1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                                 
## [2] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [3] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [5] "A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."                                                                                                               
## [6] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."

Scrape the movie runtime

# Select the runtime element inside each muted details line
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Pull the runtime text (e.g. "85 min") out of each node
runtime_data <- runtime_data_html %>% html_text()

# Preview the first few raw runtimes
runtime_data %>% head()
## [1] "85 min"  "133 min" "108 min" "108 min" "145 min" "123 min"
# Drop the " min" unit suffix, then parse what is left as a number

runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))

# Preview the numeric runtimes
runtime_data %>% head()
## [1]  85 133 108 108 145 123

Scrape the movie genre

# Select the genre element of every movie
genre_data_html <- webpage %>% html_nodes(".genre")

# Pull the genre text out of each node
genre_data <- genre_data_html %>% html_text()

# Preview the first few raw genre strings
genre_data %>% head()
## [1] "\nHorror, Thriller            "          
## [2] "\nAction, Adventure, Sci-Fi            " 
## [3] "\nAnimation, Comedy, Family            " 
## [4] "\nAction, Adventure, Comedy            " 
## [5] "\nDrama, Romance, Thriller            "  
## [6] "\nAction, Adventure, Fantasy            "
# Clean the genre text in one pass:
#   1. remove the newline character,
#   2. remove every space (this also strips the trailing padding),
#   3. keep only the first genre by deleting everything from the first comma on,
#   4. store the result as a factor.
genre_data <- genre_data %>%
  gsub(pattern = "\n", replacement = "") %>%
  gsub(pattern = " ", replacement = "") %>%
  gsub(pattern = ",.*", replacement = "") %>%
  as.factor()

# Preview the cleaned genres and their factor levels
genre_data %>% head()
## [1] Horror    Action    Animation Action    Drama     Action   
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

Scrape the IMDB rating

# Select the bold rating value inside each IMDb rating element
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

# Pull the rating text out of each node
rating_data <- rating_data_html %>% html_text()

# Preview the first few raw ratings
rating_data %>% head()
## [1] "5.6" "7.8" "7.1" "8.0" "8.1" "5.9"
# Parse the rating strings as numbers
rating_data <- rating_data %>% as.numeric()

# Preview the numeric ratings
rating_data %>% head()
## [1] 5.6 7.8 7.1 8.0 8.1 5.9

Scrape the votes

# Select the second child span of each votes bar, which holds the vote count
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Pull the vote-count text out of each node
votes_data <- votes_data_html %>% html_text()

# Preview the first few raw vote counts
votes_data %>% head()
## [1] "25,977"    "630,960"   "169,937"   "1,028,962" "146,512"   "683,422"
# Remove the thousands separators, then parse the vote counts as numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))

# Preview the numeric vote counts
votes_data %>% head()
## [1]   25977  630960  169937 1028962  146512  683422

Scrape the directors and actors

# Select the first link in the paragraph that follows each muted paragraph
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")

# Pull the director name out of each link node
directors_data <- directors_data_html %>% html_text()

# Preview the first few director names
directors_data %>% head()
## [1] "Damien Leone"   "Gareth Edwards" "Garth Jennings" "Tim Miller"    
## [5] "Park Chan-wook" "David Ayer"
# Store the director names as a factor
directors_data <- directors_data %>% as.factor()

## Actors

# Select the link that directly follows the ".ghost" element in each list item
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")

# Pull the actor name out of each link node
actors_data <- actors_data_html %>% html_text()

# Preview the first few actor names
actors_data %>% head()
## [1] "Jenna Kanell"        "Felicity Jones"      "Matthew McConaughey"
## [4] "Ryan Reynolds"       "Kim Min-hee"         "Will Smith"

Scrape metascore data

Find metascore data with missing values and replace with NAs (this is an automated method)

# Scrape the whole ratings bar of every movie as text, so that a movie without
# a Metascore still contributes one element
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))
head(ratings_bar_data)
## [1] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [2] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n65 Metascore"
## [3] "7.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.1/10 X \n59 Metascore"
## [4] "8.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 8/10 X \n65 Metascore"  
## [5] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n84 Metascore"
## [6] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
# Extract the number that directly precedes " Metascore" in each ratings bar.
# Column 2 of the str_match() result is the capture group, i.e. the digits only.
# Ratings bars without a Metascore have no match and therefore become NA.
# The group accepts one to three digits: a fixed two-digit pattern would read
# "100 Metascore" as 0 and would miss single-digit scores entirely.
metascore_data <- str_match(ratings_bar_data, "(\\d{1,3}) Metascore")[, 2] %>%
  as.numeric() # convert the captured digits to numbers

length(metascore_data)
## [1] 100
metascore_data
##   [1] NA 65 59 65 84 40 62 71 71 52 81 94 65 60 70 81 NA 73 67 74 48 67 33 51 72
##  [26] 75 82 79 78 45 44 66 81 57 41 76 58 96 65 54 88 68 72 26 60 99 65 57 42 NA
##  [51] 47 48 51 59 78 66 81 35 32 60 64 44 68 52 42 51 NA 48 62 21 69 47 77 35 NA
##  [76] 42 58 66 55 36 32 68 42 79 33 81 25 69 NA 72 44 51 74 83 47 68 55 78 62 61
summary(metascore_data) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   48.00   62.00   60.61   72.00   99.00       6

Scrape gross earnings

Find the missing gross earnings (automated). Earnings are part of the votes bar in the HTML, so scrape the votes bar and extract the earnings with a regular expression; this keeps an NA in place for every movie that has no earnings listed.

# Scrape the whole votes bar of every movie as text
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))
head(votes_bar_data) # preview the votes bar text
## [1] "Votes: 25,977"                      "Votes: 630,960 | Gross: $532.18M"  
## [3] "Votes: 169,937 | Gross: $270.40M"   "Votes: 1,028,962 | Gross: $363.07M"
## [5] "Votes: 146,512 | Gross: $2.01M"     "Votes: 683,422 | Gross: $325.10M"
# Extract the gross earnings from each votes bar.
# The capture group (column 2 of the str_match() result) holds the digits that
# follow the "$" sign, so neither the "$" nor the trailing "M" needs to be
# removed afterwards. Votes bars without a gross figure give NA.
# Capturing the full number avoids cutting it to a fixed width, which would
# turn "$532.18M" into 532.1 and "$363.07M" into 363.
# NOTE(review): the values are assumed to be in millions (every figure in the
# preview carries an "M" suffix) - confirm no row uses another unit.
gross_data <- str_match(votes_bar_data, "\\$([0-9.,]+)")[, 2]

# Drop any thousands separators, then convert to numbers
gross_data <- as.numeric(gsub(",", "", gross_data, fixed = TRUE))
length(gross_data) 
## [1] 100
head(gross_data)
## [1]     NA 532.18 270.40 363.07   2.01 325.10

Create dataframe

# Assemble the scraped vectors into one data frame, one row per movie
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types of the data frame
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Terrifier" "Star Wars: Rogue One" "Sing" "Deadpool" ...
##  $ Description         : chr  "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death St"| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man"| __truncated__ ...
##  $ Runtime             : num  85 133 108 108 145 123 117 88 139 144 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 8 1 3 1 7 1 8 6 4 1 ...
##  $ Rating              : num  5.6 7.8 7.1 8 8.1 5.9 7.3 7.1 8.1 6.9 ...
##  $ Metascore           : num  NA 65 59 65 84 40 62 71 71 52 ...
##  $ Votes               : num  25977 630960 169937 1028962 146512 ...
##  $ Gross_Earning_in_Mil: num  NA 532.1 270.4 363 2.01 ...
##  $ Director            : Factor w/ 95 levels "Adam Wingard",..: 19 32 34 90 67 23 53 31 57 11 ...
##  $ Actor               : chr  "Jenna Kanell" "Felicity Jones" "Matthew McConaughey" "Ryan Reynolds" ...

Question 1: Based on the above data, which movie from which genre had the longest runtime?

# Runtime histogram in 5-minute bins; the genres are drawn as overlapping,
# semi-transparent layers rather than stacked bars
p1 <- ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(name = "Genre") +
  labs(title = "Top 100 Movies of 2016 Runtime by Genre")
ggplotly(p1)
# Order the movies from longest to shortest runtime
highestRuntime <- movies_df %>%
  arrange(desc(Runtime)) %>%
  select(Title, Genre, Runtime)
head(highestRuntime)
##                                                 Title  Genre Runtime
## 1 Batman v Superman: Dawn of Justice Ultimate Edition Action     182
## 2            Dangal: Die Hoffnung auf den großen Sieg Action     161
## 3                                             Silence  Drama     161
## 4                         The Wailing: Die Besessenen  Drama     156
## 5                  Batman v Superman: Dawn of Justice Action     152
## 6                        The First Avenger: Civil War Action     147

Batman v Superman: Dawn of Justice Ultimate Edition from the Action genre has the highest runtime of 182 minutes.

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

# Scatter plot of rating against runtime; point size shows the vote count and
# colour shows the genre.
# The hover text is built from the Title column of the plotted data frame
# rather than from a separate global vector, so each label always belongs to
# its own row even if the data frame is filtered or reordered first.
# ggplot2 does not know the `text` aesthetic (hence the warning below); it is
# included for the interactive tooltip produced by ggplotly().
p2 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(
    aes(size = Votes, col = Genre, text = paste("Movie Title:", Title)),
    alpha = 0.7
  ) +
  labs(title = "Top 100 Movies of 2016 Runtime by Ratings")
## Warning: Ignoring unknown aesthetics: text
ggplotly(p2)
# Keep the movies that run 130 to 160 minutes (inclusive) and order them by
# vote count, highest first
highestVotes <- movies_df %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  select(Title, Genre, Runtime, Votes) %>%
  arrange(desc(Votes))
head(highestVotes)
##                                               Title     Genre Runtime  Votes
## 1                      The First Avenger: Civil War    Action     147 781401
## 2                Batman v Superman: Dawn of Justice    Action     152 697251
## 3                              Star Wars: Rogue One    Action     133 630960
## 4                  Hacksaw Ridge - Die Entscheidung Biography     139 521030
## 5 Phantastische Tierwesen und wo sie zu finden sind Adventure     132 474239
## 6                                 X-Men: Apocalypse    Action     144 434600

The First Avenger: Civil War (the title under which Captain America: Civil War appears in the scraped data) in the Action genre has the highest votes of 781,401 with a runtime of 147 minutes.

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?

# Scatter plot of gross earnings against runtime; point size shows the rating
# and colour shows the genre. The y axis is limited to -10..600.
p3 <- ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  labs(title = "Top 100 Movies of 2016 Runtime by Gross Earnings in Millions") +
  scale_y_continuous(name = "Gross Earnings in Millions", limits = c(-10, 600))
ggplotly(p3)
# Average gross earnings per genre for movies that run 100 to 120 minutes
# (inclusive) and have a gross figure, ordered from highest to lowest average
highestEarnings <- movies_df %>%
  filter(
    Runtime >= 100,
    Runtime <= 120,
    !is.na(Gross_Earning_in_Mil)
  ) %>%
  group_by(Genre) %>%
  summarise(avgGrossEarn = mean(Gross_Earning_in_Mil)) %>%
  arrange(desc(avgGrossEarn))
head(highestEarnings)
## # A tibble: 6 × 2
##   Genre     avgGrossEarn
##   <fct>            <dbl>
## 1 Animation        216. 
## 2 Adventure        185. 
## 3 Action            87.8
## 4 Drama             55.2
## 5 Crime             51.1
## 6 Horror            46.8

The Animation genre has the highest average gross earnings with $216.33 million.