Install packages for the project

#install.packages('rvest')
#Loading the rvest package
library(rvest)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.4.1 
## v readr   2.1.2      v forcats 0.5.2 
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Scrape the IMDb website to build a data frame of information about the top movies of 2016

Use the following IMDb search URL, which lists feature films released in 2016:

http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature

# Address of the IMDb search page: 100 feature films with a 2016 release date
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Fetch the page and parse it into an HTML document; every selector below
# queries this object
webpage <- url %>%
  read_html()
# save_URL(webpage,filename="webpage.html")

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

# Select the nodes that carry each movie's rank label
rank_data_html <- webpage %>%
  html_nodes(".text-primary")

# Pull the label text out of those nodes
rank_data <- rank_data_html %>%
  html_text()

# Inspect the raw labels and their type before converting
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
class(rank_data)
## [1] "character"
# Data preprocessing: turn the "1."-style labels into numbers
rank_data <- rank_data %>%
  as.numeric()

# Check the converted values and how many ranks were scraped
head(rank_data)
## [1] 1 2 3 4 5 6
length(rank_data)
## [1] 100

Scrape for Title information

# Select the link inside each list item's header; it holds the movie title
title_data_html <- webpage %>%
  html_nodes(".lister-item-header a")

# Extract the title text from the selected links
title_data <- title_data_html %>%
  html_text()

# Inspect the first titles and the number of titles scraped
head(title_data)
## [1] "Terrifier"                    "Rogue One: A Star Wars Story"
## [3] "Sing"                         "Suicide Squad"               
## [5] "Deadpool"                     "The Handmaiden"
length(title_data)
## [1] 100

Scrape for movie description information

# Select the muted paragraph that directly follows each ratings bar; it holds
# the plot description
description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")

# Extract the description text
description_data <- description_data_html %>%
  html_text()

# Inspect the raw descriptions (note the leading newline)
head(description_data)
## [1] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                                 
## [2] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [3] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [5] "\nA wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [6] "\nA woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."
# Data preprocessing: drop every newline character from the descriptions
description_data <- gsub(
  pattern = "\n",
  replacement = "",
  x = description_data
)

# Inspect the cleaned descriptions and the number scraped
head(description_data)
## [1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                                 
## [2] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [3] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [5] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [6] "A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."
length(description_data)
## [1] 100

Scrape for movie run times

# Select the runtime element of each movie
runtime_data_html <- webpage %>%
  html_nodes(".text-muted .runtime")

# Extract the runtime text, e.g. "85 min"
runtime_data <- runtime_data_html %>%
  html_text()

# Inspect the raw runtime strings
head(runtime_data)
## [1] "85 min"  "133 min" "108 min" "123 min" "108 min" "145 min"
# Data preprocessing: strip the " min" suffix, then convert to numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

# Inspect the numeric runtimes and the number scraped
head(runtime_data)
## [1]  85 133 108 123 108 145
length(runtime_data)
## [1] 100

Scrape for movie genre information

# Using CSS selectors to scrape the movie genre section
genre_data_html <- html_nodes(webpage, ".genre")

# Converting the genre data to text
genre_data <- html_text(genre_data_html)

# Let's have a look at the raw genre strings
head(genre_data)
## [1] "\nHorror, Thriller            "          
## [2] "\nAction, Adventure, Sci-Fi            " 
## [3] "\nAnimation, Comedy, Family            " 
## [4] "\nAction, Adventure, Fantasy            "
## [5] "\nAction, Adventure, Comedy            " 
## [6] "\nDrama, Romance, Thriller            "
# Data preprocessing: trim the leading newline and the trailing padding.
# trimws() only touches the ends of each string, so a first genre that
# contains a space is kept intact instead of having its words fused together.
genre_data <- trimws(genre_data)

# Keep only the first genre of each movie: drop everything from the first
# comma onwards
genre_data <- sub(",.*", "", genre_data)

# Converting each genre from text to factor
genre_data <- as.factor(genre_data)

# Let's have another look at the genre data
head(genre_data)
## [1] Horror    Action    Animation Action    Action    Drama    
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
length(genre_data)
## [1] 100

Scrape for movie Rating information

# Select the bold rating value inside each IMDb rating widget
rating_data_html <- webpage %>%
  html_nodes(".ratings-imdb-rating strong")

# Extract the rating text
rating_data <- rating_data_html %>%
  html_text()

# Inspect the raw rating strings
head(rating_data)
## [1] "5.6" "7.8" "7.1" "5.9" "8.0" "8.1"
# Data preprocessing: convert the rating strings to numbers
rating_data <- rating_data %>%
  as.numeric()

# Inspect the numeric ratings and the number scraped
head(rating_data)
## [1] 5.6 7.8 7.1 5.9 8.0 8.1
length(rating_data)
## [1] 100

Scrape for voting information

# Select the second span of each votes line; it holds the vote count
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Extract the vote-count text
votes_data <- votes_data_html %>%
  html_text()

# Inspect the raw vote counts (comma-grouped strings)
head(votes_data)
## [1] "27,280"    "631,303"   "170,086"   "683,616"   "1,029,413" "146,610"
# Data preprocessing: remove the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data))

# Inspect the numeric vote counts and the number scraped
head(votes_data)
## [1]   27280  631303  170086  683616 1029413  146610
length(votes_data)
## [1] 100

Scrape for movie director information

# Select the first link of the paragraph following the muted description;
# the output below shows it yields the director names
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the director names as text
directors_data <- directors_data_html %>%
  html_text()

# Inspect the first director names
head(directors_data)
## [1] "Damien Leone"   "Gareth Edwards" "Garth Jennings" "David Ayer"    
## [5] "Tim Miller"     "Park Chan-wook"
# Data preprocessing: store the directors as a factor
directors_data <- directors_data %>%
  as.factor()
length(directors_data)
## [1] 100

Scrape for movie actor information

# Select the link that directly follows the ".ghost" separator in each list
# item; the output below shows it yields one actor name per movie
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")

# Extract the actor names as text
actors_data <- actors_data_html %>%
  html_text()

# Inspect the first actor names
head(actors_data)
## [1] "Jenna Kanell"        "Felicity Jones"      "Matthew McConaughey"
## [4] "Will Smith"          "Ryan Reynolds"       "Kim Min-hee"
# Data preprocessing: store the actors as a factor
actors_data <- actors_data %>%
  as.factor()
length(actors_data)
## [1] 100

Find metascore data with missing values and replace with NAs

# Scrape the whole ratings bar of every movie as text. The bar exists for each
# movie whether or not it has a Metascore, so the result keeps one entry per
# movie and missing scores can be detected per entry.
ratings_bar_data <- webpage %>%
  html_nodes(".ratings-bar") %>%
  html_text2()
head(ratings_bar_data)
## [1] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [2] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n65 Metascore"
## [3] "7.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.1/10 X \n59 Metascore"
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "8.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 8/10 X \n65 Metascore"  
## [6] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n84 Metascore"
# Extract the digits that immediately precede the word "Metascore".
# "\\d+" accepts scores of any width (a fixed two-digit pattern would read
# "100 Metascore" as 0 and miss single-digit scores). str_extract() returns NA
# where the bar has no Metascore, so those movies become NA.
metascore_data <- ratings_bar_data %>%
  str_extract("\\d+(?=\\s*Metascore)") %>%
  as.numeric()

length(metascore_data)
## [1] 100
metascore_data
##   [1] NA 65 59 40 65 84 62 65 60 44 71 81 73 71 57 66 94 81 70 67 81 51 72 67 76
##  [26] 52 79 74 66 48 78 51 75 33 47 58 96 82 65 42 65 45 41 47 42 68 72 26 57 88
##  [51] 64 99 48 60 59 81 44 NA 62 51 35 54 35 52 32 NA 55 69 NA NA 68 21 44 25 79
##  [76] 78 47 NA 58 66 77 81 42 74 42 61 48 68 NA 34 60 67 69 32 28 23 51 58 33 76
summary(metascore_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   61.00   59.45   72.00   99.00       7

Find the missing gross earnings

# Scrape the whole votes bar of every movie as text. Each movie has one bar,
# with the "Gross: $...M" part present only when a gross figure is listed.
votes_bar_data <- webpage %>%
  html_nodes(".sort-num_votes-visible") %>%
  html_text2()
head(votes_bar_data)
## [1] "Votes: 27,280"                      "Votes: 631,303 | Gross: $532.18M"  
## [3] "Votes: 170,086 | Gross: $270.40M"   "Votes: 683,616 | Gross: $325.10M"  
## [5] "Votes: 1,029,413 | Gross: $363.07M" "Votes: 146,610 | Gross: $2.01M"
# Extract the complete number between "$" and the "M" (millions) suffix.
# Taking the full match keeps every digit; cutting a fixed five characters
# would turn "$532.18M" into 532.1. Bars without a gross figure give NA.
# Requiring the "M" suffix keeps the column in millions: a figure with any
# other suffix becomes NA rather than being read in the wrong unit.
gross_data <- votes_bar_data %>%
  str_extract("(?<=\\$)[0-9.,]+(?=M)") %>%
  str_remove_all(",") %>%
  as.numeric()
length(gross_data)
## [1] 100

Combining all the lists to form a data frame

# Assemble the scraped vectors into a single data frame, one row per movie
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Show the column types and the first values of the assembled data frame
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Terrifier" "Rogue One: A Star Wars Story" "Sing" "Suicide Squad" ...
##  $ Description         : chr  "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death St"| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
##  $ Runtime             : num  85 133 108 123 108 145 117 134 117 152 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 8 1 3 1 1 7 8 8 1 1 ...
##  $ Rating              : num  5.6 7.8 7.1 5.9 8 8.1 7.3 7.3 6.9 6.4 ...
##  $ Metascore           : num  NA 65 59 40 65 84 62 65 60 44 ...
##  $ Votes               : num  27280 631303 170086 683616 1029413 ...
##  $ Gross_Earning_in_Mil: num  NA 532 270 325 363 ...
##  $ Director            : Factor w/ 96 levels "Adam Wingard",..: 22 34 36 26 90 67 54 44 68 96 ...
##  $ Actor               : Factor w/ 92 levels "Aamir Khan","Alexander Skarsgård",..: 43 33 63 92 76 53 42 90 64 7 ...

Question 1

# p1<- qplot(data = movies_df,Runtime,fill = Genre,bins = 30)
# Overlaid, semi-transparent runtime histograms coloured by genre (5-wide bins)
p1 <- ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(name = "Genre") +
  labs(title = "Top 100 Movies of 2016 Runtime by Genre")
ggplotly(p1)

# List the row(s) whose runtime equals the largest runtime in the data
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(Runtime == max(Runtime))
##   Name Rank                                                 Title
## 1   58   58 Batman v Superman: Dawn of Justice - Ultimate Edition
##                                                                                                                                                                                                                               Description
## 1 Batman is manipulated by Lex Luthor to fear Superman. Superman´s existence is meanwhile dividing the world and he is framed for murder during an international crisis. The heroes clash and force the neutral Wonder Woman to reemerge.
##   Runtime  Genre Rating Metascore Votes Gross_Earning_in_Mil    Director
## 1     182 Action    7.1        NA 58809                   NA Zack Snyder
##       Actor
## 1 Amy Adams

Answer

Batman v Superman: Dawn of Justice - Ultimate Edition, from the Action genre, is the single movie with the longest runtime (182 minutes).

Question 2

# Scatter plot of rating against runtime; point size shows votes and colour
# shows genre. The tooltip text is built from the data frame's own Title
# column so it always stays aligned with the plotted rows (a free-standing
# vector would silently mismatch if the data frame were filtered or reordered).
# `text` is declared in the plot-level mapping because ggplot2 warns about
# unknown aesthetics only when they are given to a layer; ggplotly() still
# picks it up for the hover tooltip.
p2 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Rating, text = paste("Movie Title:", Title))) +
  geom_point(aes(size = Votes, col = Genre), alpha = 0.7) +
  labs(title = "Top 100 Movies of 2016 Runtime by Ratings")
ggplotly(p2)

# Among movies running 130 to 160 minutes (inclusive), list the row(s) with
# the highest vote count
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  filter(Votes == max(Votes))
##   Name Rank                      Title
## 1   33   33 Captain America: Civil War
##                                                                                          Description
## 1 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
##   Runtime  Genre Rating Metascore  Votes Gross_Earning_in_Mil      Director
## 1     147 Action    7.8        75 781689                  408 Anthony Russo
##         Actor
## 1 Chris Evans

Answer:

Among movies with a runtime of 130 to 160 minutes, Captain America: Civil War, from the Action genre, has the highest number of votes (781,689).

Question 3

# Scatter plot of gross earnings against runtime; point size shows rating and
# colour shows genre. The y axis is limited to the range -10 to 600.
p3 <- ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  scale_y_continuous("Gross Earnings in Millions", limits = c(-10, 600)) +
  labs(title = "Top 100 Movies of 2016 Runtime by Gross Earnings in Millions")
ggplotly(p3)

# For movies running 100 to 120 minutes (inclusive) with a known gross figure,
# average the gross per genre and keep the genre(s) with the highest average
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(
    Runtime >= 100,
    Runtime <= 120,
    !is.na(Gross_Earning_in_Mil)
  ) %>%
  group_by(Genre) %>%
  summarise(averageGross = mean(Gross_Earning_in_Mil)) %>%
  filter(averageGross == max(averageGross))
## # A tibble: 1 x 2
##   Genre     averageGross
##   <fct>            <dbl>
## 1 Animation         216.

Answer

Based on the above data, Animation has the highest average gross earnings (about 216 million) among movies with a runtime of 100 to 120 minutes.