Install packages for the project

#install.packages('rvest')
#Loading the rvest package
library(rvest)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.4.1 
## v readr   2.1.2      v forcats 0.5.2 
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Scrape the IMDB website to create a dataframe of information from 2016 top movies

Use the following URL for IMDb movies of 2016

http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature

# Address of the listing to scrape; the query string asks for 100 feature
# titles with a 2016 release date

url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Fetch the page and parse it into an HTML document

webpage <- url %>%
  read_html()
# save_URL(webpage,filename="webpage.html")

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

# Select the nodes that carry each movie's rank via their CSS class
rank_data_html <- webpage %>%
  html_nodes(".text-primary")

# Pull the text content out of the selected nodes
rank_data <- rank_data_html %>%
  html_text()

# Preview the raw ranks
head(rank_data)

## [1] "1." "2." "3." "4." "5." "6."

class(rank_data)

## [1] "character"

# Pre-processing: the ranks arrive as strings such as "1.", so coerce them
# to numbers
rank_data <- rank_data %>%
  as.numeric()

# Preview the converted ranks
head(rank_data)

## [1] 1 2 3 4 5 6

length(rank_data)

## [1] 100

Scrape for Title information

# Select the link inside each list item's header; it holds the movie title

title_data_html <- webpage %>%
  html_nodes(".lister-item-header a")

# Extract the titles as plain text

title_data <- title_data_html %>%
  html_text()

# Preview the first few titles
head(title_data)

## [1] "Terrifier"                    "Rogue One: A Star Wars Story"
## [3] "Sing"                         "Suicide Squad"               
## [5] "Deadpool"                     "The Handmaiden"

length(title_data)

## [1] 100

Scrape for movie description information

# Select the muted paragraph that directly follows each ratings bar; it holds
# the movie description

description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")

# Extract the descriptions as plain text

description_data <- description_data_html %>%
  html_text()

# Preview the raw descriptions

head(description_data)

## [1] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                                 
## [2] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [3] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [5] "\nA wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [6] "\nA woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."

# Pre-processing: strip the newline characters from every description

description_data <- gsub("\n", "", description_data, fixed = TRUE)

# Preview the cleaned descriptions

head(description_data)

## [1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                                 
## [2] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [3] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [5] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [6] "A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."

# Number of descriptions scraped
length(description_data)

## [1] 100

Scrape for movie run times

# Select the runtime element of every movie

runtime_data_html <- webpage %>%
  html_nodes(".text-muted .runtime")

# Extract the runtimes as plain text
runtime_data <- runtime_data_html %>%
  html_text()

# Preview the raw runtimes
head(runtime_data)

## [1] "85 min"  "133 min" "108 min" "123 min" "108 min" "145 min"

# Pre-processing: drop the " min" suffix, then convert to numeric
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))

# Preview the converted runtimes
head(runtime_data)

## [1]  85 133 108 123 108 145

length(runtime_data)

## [1] 100

Scrape for movie genre information

# Select the genre element of every movie
genre_data_html <- webpage %>%
  html_nodes(".genre")

# Extract the genres as plain text
genre_data <- genre_data_html %>%
  html_text()

# Preview the raw genres
head(genre_data)

## [1] "\nHorror, Thriller            "          
## [2] "\nAction, Adventure, Sci-Fi            " 
## [3] "\nAnimation, Comedy, Family            " 
## [4] "\nAction, Adventure, Fantasy            "
## [5] "\nAction, Adventure, Comedy            " 
## [6] "\nDrama, Romance, Thriller            "

# Pre-processing: strip newline characters
genre_data <- gsub("\n", "", genre_data, fixed = TRUE)

# Pre-processing: strip every space character

genre_data <- gsub(" ", "", genre_data, fixed = TRUE)

# Keep only the first genre: delete everything from the first comma onwards

genre_data <- gsub(",.*", "", genre_data)

# Convert the genres from text to a factor
genre_data <- genre_data %>%
  as.factor()

# Preview the cleaned genres

head(genre_data)

## [1] Horror    Action    Animation Action    Action    Drama    
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

length(genre_data)

## [1] 100

Scrape for movie Rating information

# Select the bold rating value inside every IMDb rating element
rating_data_html <- webpage %>%
  html_nodes(".ratings-imdb-rating strong")

# Extract the ratings as plain text
rating_data <- rating_data_html %>%
  html_text()

# Preview the raw ratings
head(rating_data)

## [1] "5.6" "7.8" "7.1" "5.9" "8.0" "8.1"

# Pre-processing: convert the rating strings to numeric
rating_data <- rating_data %>%
  as.numeric()

# Preview the converted ratings
head(rating_data)

## [1] 5.6 7.8 7.1 5.9 8.0 8.1

length(rating_data)

## [1] 100

Scrape for voting information

# Select the second child span of every votes bar; it holds the vote count
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Extract the vote counts as plain text
votes_data <- votes_data_html %>%
  html_text()

# Preview the raw vote counts
head(votes_data)

## [1] "27,280"    "631,303"   "170,086"   "683,616"   "1,029,413" "146,610"

# Pre-processing: remove the thousands separators

votes_data <- gsub(",", "", votes_data, fixed = TRUE)

# Pre-processing: convert the vote counts to numeric

votes_data <- votes_data %>%
  as.numeric()

# Preview the converted vote counts

head(votes_data)

## [1]   27280  631303  170086  683616 1029413  146610

length(votes_data)

## [1] 100

Scrape for movie director information

# Select the first link in the paragraph that follows the muted text; the
# preview below shows it yields director names

directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the director names as plain text

directors_data <- directors_data_html %>%
  html_text()

# Preview the director names

head(directors_data)

## [1] "Damien Leone"   "Gareth Edwards" "Garth Jennings" "David Ayer"    
## [5] "Tim Miller"     "Park Chan-wook"

# Pre-processing: store the directors as a factor

directors_data <- directors_data %>%
  as.factor()
length(directors_data)

## [1] 100

Scrape for movie actor information

# Select the link that directly follows the ".ghost" separator in every list
# item; the preview below shows it yields one actor name per movie

actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")

# Extract the actor names as plain text

actors_data <- actors_data_html %>%
  html_text()

# Preview the actor names

head(actors_data)

## [1] "Jenna Kanell"        "Felicity Jones"      "Matthew McConaughey"
## [4] "Will Smith"          "Ryan Reynolds"       "Kim Min-hee"

# Pre-processing: store the actors as a factor

actors_data <- actors_data %>%
  as.factor()
length(actors_data)

## [1] 100

Find metascore data with missing values and replace with NAs

# Scrape every movie's whole ratings bar and convert it to text; the
# Metascore, when a movie has one, appears inside this text
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))
head(ratings_bar_data)

## [1] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [2] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n65 Metascore"
## [3] "7.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.1/10 X \n59 Metascore"
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "8.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 8/10 X \n65 Metascore"  
## [6] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n84 Metascore"

# Extract the Metascore from each ratings bar. The capture group takes every
# digit in front of " Metascore" (rather than exactly two), so a one-digit
# score or a perfect 100 is read correctly. str_match() returns one row per
# ratings bar, with NA where the pattern is absent, so movies without a
# Metascore keep their position as NA.
metascore_data <- as.numeric(
  str_match(ratings_bar_data, "(\\d+) Metascore")[, 2]
)

length(metascore_data)

## [1] 100

metascore_data

##   [1] NA 65 59 40 65 84 62 65 60 44 71 81 73 71 57 66 94 81 70 67 81 51 72 67 76
##  [26] 52 79 74 66 48 78 51 75 33 47 58 96 82 65 42 65 45 41 47 42 68 72 26 57 88
##  [51] 64 99 48 60 59 81 44 NA 62 51 35 54 35 52 32 NA 55 69 NA NA 68 21 44 25 79
##  [76] 78 47 NA 58 66 77 81 42 74 42 61 48 68 NA 34 60 67 69 32 28 23 51 58 33 76

summary(metascore_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   61.00   59.45   72.00   99.00       7

Find the missing gross earnings

# Scrape every movie's whole votes bar and convert it to text; the gross
# figure, when a movie has one, appears after the vote count
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))
head(votes_bar_data)

## [1] "Votes: 27,280"                      "Votes: 631,303 | Gross: $532.18M"  
## [3] "Votes: 170,086 | Gross: $270.40M"   "Votes: 683,616 | Gross: $325.10M"  
## [5] "Votes: 1,029,413 | Gross: $363.07M" "Votes: 146,610 | Gross: $2.01M"

# Pull the amount that follows "$" in each votes bar. Capturing the whole
# number (instead of a fixed five-character substring) keeps every decimal,
# e.g. "$532.18M" becomes 532.18 rather than 532.1. Votes bars without a "$"
# amount yield NA, so the vector keeps one entry per movie.
gross_data <- str_match(votes_bar_data, "\\$([0-9.,]+)")[, 2]

# Drop any thousands separators before converting to numeric
gross_data <- as.numeric(gsub(",", "", gross_data, fixed = TRUE))
length(gross_data)

## [1] 100

Combining all the lists to form a data frame

# Assemble the scraped vectors into one data frame, one row per movie
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types and the first values of the data frame

str(movies_df)

## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Terrifier" "Rogue One: A Star Wars Story" "Sing" "Suicide Squad" ...
##  $ Description         : chr  "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death St"| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
##  $ Runtime             : num  85 133 108 123 108 145 117 134 117 152 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 8 1 3 1 1 7 8 8 1 1 ...
##  $ Rating              : num  5.6 7.8 7.1 5.9 8 8.1 7.3 7.3 6.9 6.4 ...
##  $ Metascore           : num  NA 65 59 40 65 84 62 65 60 44 ...
##  $ Votes               : num  27280 631303 170086 683616 1029413 ...
##  $ Gross_Earning_in_Mil: num  NA 532 270 325 363 ...
##  $ Director            : Factor w/ 96 levels "Adam Wingard",..: 22 34 36 26 90 67 54 44 68 96 ...
##  $ Actor               : Factor w/ 92 levels "Aamir Khan","Alexander Skarsgård",..: 43 33 63 92 76 53 42 90 64 7 ...

Question 1

# p1<- qplot(data = movies_df,Runtime,fill = Genre,bins = 30)
# Overlaid, semi-transparent runtime histograms with a bin width of 5, one
# fill colour per genre
p1 <- ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(name = "Genre") +
  labs(title = "Top 100 Movies of 2016 Runtime by Genre")
ggplotly(p1)

# Show the row(s) whose runtime equals the maximum runtime; the row names are
# kept in a "Name" column
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(Runtime == max(Runtime))

##   Name Rank                                                 Title
## 1   58   58 Batman v Superman: Dawn of Justice - Ultimate Edition
##                                                                                                                                                                                                                               Description
## 1 Batman is manipulated by Lex Luthor to fear Superman. Superman´s existence is meanwhile dividing the world and he is framed for murder during an international crisis. The heroes clash and force the neutral Wonder Woman to reemerge.
##   Runtime  Genre Rating Metascore Votes Gross_Earning_in_Mil    Director
## 1     182 Action    7.1        NA 58809                   NA Zack Snyder
##       Actor
## 1 Amy Adams

Answer

Batman v Superman: Dawn of Justice - Ultimate Edition, from the Action genre, is the single movie with the longest runtime (182 minutes).

Question 2

# Scatter plot of rating against runtime; point size encodes votes and colour
# encodes genre. ggplot2 itself does not know the `text` aesthetic (hence the
# "unknown aesthetics" warning); it is supplied for the ggplotly() tooltip.
# The label is built from the `Title` column of the plotted data rather than
# the global `title_data` vector, so each label always belongs to the row
# being drawn even if the data frame is later filtered or reordered.
p2 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(
    aes(size = Votes, col = Genre, text = paste("Movie Title:", Title)),
    alpha = 0.7
  ) +
  labs(title = "Top 100 Movies of 2016 Runtime by Ratings")

## Warning: Ignoring unknown aesthetics: text

# Render the scatter plot as an interactive plotly widget
ggplotly(p2)

# Among movies running 130 to 160 minutes (bounds included), show the row(s)
# with the highest vote count
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(between(Runtime, 130, 160)) %>%
  filter(Votes == max(Votes))

##   Name Rank                      Title
## 1   33   33 Captain America: Civil War
##                                                                                          Description
## 1 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
##   Runtime  Genre Rating Metascore  Votes Gross_Earning_in_Mil      Director
## 1     147 Action    7.8        75 781689                  408 Anthony Russo
##         Actor
## 1 Chris Evans

Answer:

Among movies with a runtime of 130 to 160 minutes, Captain America: Civil War, from the Action genre, has the most votes.

Question 3

# Scatter plot of gross earnings against runtime; point size encodes rating
# and colour encodes genre. The y axis is limited to -10..600.
p3 <- ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  labs(title = "Top 100 Movies of 2016 Runtime by Gross Earnings in Millions") +
  scale_y_continuous("Gross Earnings in Millions", limits = c(-10, 600))
ggplotly(p3)

# For movies running 100 to 120 minutes (bounds included) that have a gross
# figure, average the gross per genre and keep the genre(s) with the highest
# average
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(between(Runtime, 100, 120), !is.na(Gross_Earning_in_Mil)) %>%
  group_by(Genre) %>%
  summarise(averageGross = mean(Gross_Earning_in_Mil)) %>%
  filter(averageGross == max(averageGross))

## # A tibble: 1 x 2
##   Genre     averageGross
##   <fct>            <dbl>
## 1 Animation         216.

Answer

Based on the above data, Animation has the highest average gross earnings among movies with a runtime of 100 to 120 minutes.

Webscraping Assignment

Claudia KOUMBA

2022-11-01

Install packages for the project

Scrape the IMDB website to create a dataframe of information from 2016 top movies

Use the following URL for IMDb movies of 2016

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

Scrape for Title information

Scrape for movie description information

Scrape for movie run times

Scrape for movie genre information

Scrape for movie Rating information

Scrape for voting information

Scrape for movie director information

Scrape for movie actor information

Find metascore data with missing values and replace with NAs

Find the missing gross earnings

Combining all the lists to form a data frame

Question 1

Answer

Question 2

Answer:

Question 3

Answer