Webscraping Assignment

Load the necessary packages

library(rvest)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyr)
library(dplyr)

Use the following URL from IMDb for movies of 2016

# IMDb advanced-search URL: feature films released in 2016, 100 results per page
url <- "https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the HTML of that page
webpage <- url %>% read_html()
# save_url(webpage, filename="webpage.html")

Load various elements and clean data using gsub.

Scrape for Movie Rank Information

Use the `length()` command to confirm that each vector contains 100 elements, with NAs standing in for missing data so that every vector still has 100 elements.

# Select the ranking nodes with a CSS selector
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Convert the ranking nodes to text
rank_data <- rank_data_html %>% html_text()
# Inspect the raw rank strings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
## [1] "1." "2." "3." "4." "5." "6."
# Pre-processing: convert the rank strings to numbers
rank_data <- rank_data %>% as.numeric()
# Inspect the numeric ranks
head(rank_data)
## [1] 1 2 3 4 5 6
## [1] 1 2 3 4 5 6
length(rank_data)
## [1] 100
## [1] 100

Scrape for Title Information

# Select the title links with a CSS selector
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
# Convert the title nodes to text
title_data <- title_data_html %>% html_text()
# Inspect the first few titles
head(title_data)
## [1] "Doctor Strange"                         
## [2] "Sing"                                   
## [3] "Rogue One: A Star Wars Story"           
## [4] "Deadpool"                               
## [5] "Suicide Squad"                          
## [6] "Fantastic Beasts and Where to Find Them"
length(title_data)
## [1] 100

Scrape for Movie Information

# Select the description paragraph that follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
# Convert the description nodes to text
description_data <- description_data_html %>% html_text()
# Inspect the raw descriptions (each starts with a newline)
head(description_data)
## [1] "\nWhile on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts."                                                                                                   
## [2] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [3] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [4] "\nA wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [5] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [6] "\nThe adventures of writer Newt Scamander in New York's secret community of witches and wizards seventy years before Harry Potter reads his book in school."
# Pre-processing: remove every newline character
description_data <- gsub("\n", "", description_data, fixed = TRUE)
# Inspect the cleaned descriptions
head(description_data)
## [1] "While on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts."                                                                                                   
## [2] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [3] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [4] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                        
## [5] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [6] "The adventures of writer Newt Scamander in New York's secret community of witches and wizards seventy years before Harry Potter reads his book in school."
length(description_data)
## [1] 100

Scrape for Movie Run Times

# Select the runtime spans with a CSS selector
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Convert the runtime nodes to text
runtime_data <- runtime_data_html %>% html_text()
# Inspect the raw runtime strings
head(runtime_data)
## [1] "115 min" "108 min" "133 min" "108 min" "123 min" "132 min"
# Pre-processing: drop the " min" suffix and convert to numeric
runtime_data <- as.numeric(gsub(" min", "", runtime_data))
# Inspect the numeric runtimes
head(runtime_data)
## [1] 115 108 133 108 123 132
length(runtime_data)
## [1] 100

Scrape for Movie Genre Information

# Select the genre spans with a CSS selector
genre_data_html <- webpage %>% html_nodes(".genre")
# Convert the genre nodes to text
genre_data <- genre_data_html %>% html_text()
# Inspect the raw genre strings
head(genre_data)
## [1] "\nAction, Adventure, Fantasy            "
## [2] "\nAnimation, Comedy, Family            " 
## [3] "\nAction, Adventure, Sci-Fi            " 
## [4] "\nAction, Adventure, Comedy            " 
## [5] "\nAction, Adventure, Fantasy            "
## [6] "\nAdventure, Family, Fantasy            "
# Pre-processing, in order: remove newlines, remove spaces, keep only the
# first genre (drop everything from the first comma on), convert to a factor
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()
# Inspect the cleaned genre factor
head(genre_data)
## [1] Action    Animation Action    Action    Action    Adventure
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
length(genre_data)
## [1] 100

Scrape for Movie Rating Information

This information changes as the webpage updates regularly

# Select the IMDb rating nodes with a CSS selector
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Convert the rating nodes to text
rating_data <- rating_data_html %>% html_text()
# Inspect the raw rating strings
head(rating_data)
## [1] "7.5" "7.1" "7.8" "8.0" "5.9" "7.2"
# Pre-processing: convert the rating strings to numbers
rating_data <- rating_data %>% as.numeric()
# Inspect the numeric ratings
head(rating_data)
## [1] 7.5 7.1 7.8 8.0 5.9 7.2
length(rating_data)
## [1] 100

Scrape for Voting Information

# Select the vote-count span (second child of the votes bar)
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Convert the vote nodes to text
votes_data <- votes_data_html %>% html_text()
# Inspect the raw vote strings
head(votes_data)
## [1] "717,714"   "164,400"   "613,605"   "1,005,712" "675,419"   "467,210"
# Pre-processing: strip the thousands separators, then convert to numeric
votes_data <- as.numeric(gsub(",", "", votes_data))
# Inspect the numeric vote counts
head(votes_data)
## [1]  717714  164400  613605 1005712  675419  467210
length(votes_data)
## [1] 100

Scrape for Movie Director Information

# Select the first link in the paragraph after the description (the director)
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Convert the director nodes to text
directors_data <- directors_data_html %>% html_text()
# Inspect the first few directors
head(directors_data)
## [1] "Scott Derrickson" "Garth Jennings"   "Gareth Edwards"   "Tim Miller"      
## [5] "David Ayer"       "David Yates"
# Pre-processing: store the director names as a factor
directors_data <- directors_data %>% as.factor()
length(directors_data)
## [1] 100

Scrape for Movie Actor Information

# Select the link that follows the ".ghost" separator (the first listed actor)
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Convert the actor nodes to text
actors_data <- actors_data_html %>% html_text()
# Inspect the first few actors
head(actors_data)
## [1] "Benedict Cumberbatch" "Matthew McConaughey"  "Felicity Jones"      
## [4] "Ryan Reynolds"        "Will Smith"           "Eddie Redmayne"
# Pre-processing: store the actor names as a factor
actors_data <- actors_data %>% as.factor()
length(actors_data)
## [1] 100

Find metascore data with missing values and replace with NAs

(this is an automated method instead of the fallible method provided in the tutorial)

# Scrape the whole ratings bar of every movie (one element per movie), so a
# movie without a Metascore still occupies its position and becomes NA below.
ratings_bar_data <- html_nodes(webpage, ".ratings-bar") %>%
  html_text2()

head(ratings_bar_data)
## [1] "7.5\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.5/10 X \n72 Metascore"
## [2] "7.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.1/10 X \n59 Metascore"
## [3] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n65 Metascore"
## [4] "8.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 8/10 X \n65 Metascore"  
## [5] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [6] "7.2\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.2/10 X \n66 Metascore"
# Capture the run of digits directly in front of "Metascore" (column 2 of the
# str_match() result is the first capture group). Using `\\d+` instead of
# `\\d{2}` keeps one- and three-digit scores intact: with `\\d{2}` a score of
# 100 was read as 0 and a single-digit score as NA. Bars without a Metascore
# do not match and yield NA.
metascore_data <- str_match(ratings_bar_data, "(\\d+)\\s*Metascore")[, 2] %>%
  as.numeric()
length(metascore_data)
## [1] 100
metascore_data
##   [1] 72 59 65 65 40 66 71 81 84 74 75 78 51 94 81 57 62 67 51 72 54 88 64 41 52
##  [26] 99 79 66 73 44 57 70 48 47 32 76 32 71 42 42 42 NA 68 70 52 44 NA 81 51 81
##  [51] 25 60 65 58 66 96 77 74 47 67 59 77 60 77 NA 79 69 45 78 61 48 33 40 36 66
##  [76] 52 83 26 81 35 23 65 46 58 79 28 62 66 NA 33 34 78 22 39 62 72 34 59 51 21
summary(metascore_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   45.75   62.00   59.41   73.25   99.00       4

Find the missing gross earnings

(automated - this is also in place of the tutorial method, which has issues)

Earnings are part of the votes bar, so scrape the whole bar and extract the earnings with a regular expression; movies with no reported earnings then become NAs in the correct positions.

# Scrape the whole votes bar of every movie (one element per movie), so a
# movie without a reported gross still occupies its position and becomes NA.
votes_bar_data <- html_nodes(webpage, ".sort-num_votes-visible") %>%
  html_text2()
head(votes_bar_data) # look at the votes bar data
## [1] "Votes: 717,714 | Gross: $232.64M"   "Votes: 164,400 | Gross: $270.40M"  
## [3] "Votes: 613,605 | Gross: $532.18M"   "Votes: 1,005,712 | Gross: $363.07M"
## [5] "Votes: 675,419 | Gross: $325.10M"   "Votes: 467,210 | Gross: $234.04M"
# Capture the number that follows the "$" sign (column 2 of the str_match()
# result is the first capture group) and convert it to numeric, in millions.
# The capture group keeps every digit; the previous substring(x, 2, 6) kept
# only five characters and so cut "$232.64" down to 232.6. Bars with no
# "Gross:" entry do not match and yield NA.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)")[, 2] %>%
  as.numeric()

length(gross_data)
## [1] 100

Combine all the lists to form a data frame

# Combine the scraped vectors into one data frame, one row per movie.
# All vectors must have the same length for data.frame() to succeed.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Director = directors_data,
  Actors = actors_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data
)

# Show the structure (column names, types and first values)
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Doctor Strange" "Sing" "Rogue One: A Star Wars Story" "Deadpool" ...
##  $ Description         : chr  "While on a journey of physical and spiritual healing, a brilliant neurosurgeon is drawn into the world of the mystic arts." "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death St"| __truncated__ "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man"| __truncated__ ...
##  $ Runtime             : num  115 108 133 108 123 132 139 107 145 127 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 1 3 1 1 1 2 4 3 7 4 ...
##  $ Rating              : num  7.5 7.1 7.8 8 5.9 7.2 8.1 7.6 8.1 7.8 ...
##  $ Director            : Factor w/ 98 levels "Alessandro Carloni",..: 86 35 33 94 24 27 63 82 73 92 ...
##  $ Actors              : Factor w/ 92 levels "Adam Sandler",..: 8 58 32 75 90 25 4 6 50 83 ...
##  $ Metascore           : num  72 59 65 65 40 66 71 81 84 74 ...
##  $ Votes               : num  717714 164400 613605 1005712 675419 ...
##  $ Gross_Earning_in_Mil: num  233 270 532 363 325 ...

Question 1: Based on the above data, which genre had the longest runtime?

You can add plotly to get more information on each bar segment. You will also need to include additional code to filter to get the exact movie information

# Overlaid, semi-transparent histograms of runtime in 5-minute bins,
# with one fill colour per genre.
p1 <- ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(name = "Genre") +
  labs(title = "Top 100 Movies of 2016 Runtime by Genre")

p1

## Question 1: Based on the above data, which movie from which genre had the longest runtime? You can add plotly to get more information on each bar segment. You will also need to include additional code to filter to get the exact movie information

Answer Question #1 Here by using filtering

# Keep the row(s) whose runtime equals the overall maximum; any ties are
# ordered by title and then genre.
Longest_Movie <- arrange(
  filter(movies_df, Runtime == max(Runtime)),
  Title, Genre
)
print(Longest_Movie[["Title"]])
## [1] "Batman v Superman: Dawn of Justice Ultimate Edition"
print(Longest_Movie[["Genre"]])
## [1] Action
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

#Question 2: Based on the above data, in the Runtime of 130-160 mins, which movie from which genre has the highest votes? Again, use the filter to get the exact movie which answers this question.

# Scatter plot of rating against runtime; point size shows the number of votes
# and colour shows the genre.
p2 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(
    # Build the label from the Title column of the plotted data rather than the
    # global `title_data` vector, so each label stays attached to its own row
    # even if the data frame is filtered or reordered before plotting.
    # geom_point() does not draw `text` (hence the warning below); presumably
    # it is meant for plotly tooltips.
    aes(size = Votes, col = Genre, text = paste("Movie Title:", Title)),
    alpha = 0.7
  ) +
  labs(title = "Top 100 Movies of 2016 Runtime by Ratings")
## Warning: Ignoring unknown aesthetics: text
p2

Answer Question #2 Here by using filtering

# Restrict to runtimes of 130-160 minutes first, then keep the row(s) with the
# most votes inside that window; any ties are ordered by title and then genre.
Highest_Votes <- movies_df %>%
  filter(Runtime >= 130, Runtime <= 160) %>%
  filter(Votes == max(Votes)) %>%
  arrange(Title, Genre)
print(Highest_Votes[["Title"]])
## [1] "Captain America: Civil War"
print(Highest_Votes[["Genre"]])
## [1] Action
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?

# Scatter plot of gross earnings against runtime; point size shows the rating
# and colour shows the genre. The y axis is fixed to the range -10 to 600.
p3 <- ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  labs(title = "Top 100 Movies of 2016 Runtime by Gross Earnings in Millions") +
  scale_y_continuous(name = "Gross Earnings in Millions", limits = c(-10, 600))
p3
## Warning: Removed 13 rows containing missing values (geom_point).

Answer Question #3 here by using filtering. Feel free to make the ggplots interactive using plotly. You will have to answer the 3 questions at the end using some filtering techniques.

# Keep only the columns needed for the question, restricted to movies with a
# runtime of 100 to 120 minutes (inclusive).
genre_df <- movies_df %>%
  select(Genre, Runtime, Gross_Earning_in_Mil) %>%
  filter(Runtime >= 100 & Runtime <= 120)
# Drop movies with missing values, average the gross per genre, and keep the
# genre(s) with the highest average. No na.rm is needed after drop_na(), and
# mean() already returns one value per group, so no max() around it.
Highest_Gross <- genre_df %>%
  drop_na() %>%
  group_by(Genre) %>%
  summarise(Gross_Earning_in_Mil = mean(Gross_Earning_in_Mil)) %>%
  filter(Gross_Earning_in_Mil == max(Gross_Earning_in_Mil))
print(Highest_Gross$Genre)
## [1] Animation
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
print(Highest_Gross$Gross_Earning_in_Mil)
## [1] 216.33