Web Scraping RMD

Load the Necessary Packages for this Project

library("tidyverse")

## Warning: package 'tidyverse' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("rvest")

## Warning: package 'rvest' was built under R version 4.2.3

## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding

library("ggplot2")

Scrape the IMDb website to create a data frame of information about the top 100 movies of 2016

Use the following URL for IMDb movies of 2016

# Target page: IMDb feature-film search restricted to 2016 releases (count=100)
url <- "https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature"

# Download the page and parse its HTML into a document object
webpage <- url %>% read_html()

Load Various Elements and Clean Data Using Gsub

Scrape for Movie Rank Information

Use the command length() to ensure that each vector contains 100 elements, with NAs standing in for missing data so that every vector still has 100 elements.

# Select the nodes that hold each movie's rank, using the CSS selector
# identified with SelectorGadget
rank_data_html <- html_nodes(webpage, ".text-primary")

# Extract the rank text from the selected nodes
rank_data <- html_text(rank_data_html)

# Preview the scraped ranks. Pass the object itself: head('rank_data') would
# only echo the literal string "rank_data".
head(rank_data)

## (stale output removed: the earlier call head('rank_data') printed the literal
## string "rank_data"; re-knit to see the scraped rank text)

# Data pre-processing: turn the rank strings into numbers
rank_data <- rank_data %>% as.numeric()

# Confirm the first few ranks after conversion
rank_data %>% head()

## [1] 1 2 3 4 5 6

# Number of ranks scraped
rank_data %>% length()

## [1] 100

Scrape for Movie Titles Information

# Select the anchor inside each list-item header; these hold the movie titles
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Pull the title text out of the selected nodes
title_data <- title_data_html %>% html_text()

# Preview the titles
title_data %>% head()

## [1] "The Magnificent Seven"        "Me Before You"               
## [3] "Rogue One: A Star Wars Story" "Hidden Figures"              
## [5] "Suicide Squad"                "Sing"

# Number of titles scraped
title_data %>% length()

## [1] 100

Scrape for Movie Description Information

# Select the muted paragraph that follows each ratings bar (the description)
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Pull the description text out of the selected nodes
description_data <- description_data_html %>% html_text()

# Preview the raw descriptions (still carrying a leading "\n")
description_data %>% head()

## [1] "\nSeven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."                                                          
## [2] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."                                                                                                                            
## [3] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [4] "\nThe story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."                                                                              
## [5] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [6] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."

Clean the Movie Description Information by Removing '\n'

# Data pre-processing: drop every newline character from the descriptions
description_data <- gsub("\n", "", description_data)

# Confirm the newlines are gone
description_data %>% head()

## [1] "Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."                                                          
## [2] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."                                                                                                                            
## [3] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."                                                              
## [4] "The story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."                                                                              
## [5] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                          
## [6] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."

# Number of descriptions scraped
description_data %>% length()

## [1] 100

Scrape for Movie RunTime Information

# Select the runtime element of each movie
runtine_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Pull the runtime text out of the selected nodes
runtime_data <- runtine_data_html %>% html_text()

# Preview the raw runtimes
runtime_data %>% head()

## [1] "132 min" "106 min" "133 min" "127 min" "123 min" "108 min"

# Data pre-processing: strip the "min" unit, then convert to numbers
runtime_data <- as.numeric(gsub("min", "", runtime_data))

# Confirm the converted runtimes
runtime_data %>% head()

## [1] 132 106 133 127 123 108

# Number of runtimes scraped
runtime_data %>% length()

## [1] 100

Scrape for Movie Genre Information

# Select the genre element of each movie
genre_data_html <- html_nodes(webpage, ".genre")

# Extract the genre text from the selected nodes
genre_data <- html_text(genre_data_html)

# Preview the raw genres (leading "\n" and trailing padding still present)
head(genre_data)

## [1] "\nAction, Adventure, Western            "
## [2] "\nDrama, Romance            "            
## [3] "\nAction, Adventure, Sci-Fi            " 
## [4] "\nBiography, Drama, History            " 
## [5] "\nAction, Adventure, Fantasy            "
## [6] "\nAnimation, Comedy, Family            "

# Data pre-processing: remove the newline characters
genre_data <- gsub("\n", "", genre_data)

# Data pre-processing: remove the surrounding padding. The previous
# gsub("", "", ...) used an empty pattern and changed nothing, so a
# single-genre entry kept its trailing spaces and became a separate factor
# level from the same genre scraped as the first of several.
genre_data <- trimws(genre_data)

# Keep only the first genre listed for each movie
genre_data <- gsub(",.*", "", genre_data)

# Convert the genres from text to a factor
genre_data <- as.factor(genre_data)

# Confirm the cleaned genres
head(genre_data)

## [1] Action    Drama     Action    Biography Action    Animation
## 10 Levels: Action Adventure Animation Biography Comedy ... Horror

# Number of genres scraped
length(genre_data)

## [1] 100

Scrape for Movie Rating Information

This information changes as the webpage updates regularly

# Select the bold IMDb rating value of each movie
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

# Pull the rating text out of the selected nodes
rating_data <- rating_data_html %>% html_text()

# Preview the raw ratings
rating_data %>% head()

## [1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"

# Data pre-processing: convert the ratings to numbers
rating_data <- rating_data %>% as.numeric()

# Confirm the converted ratings
rating_data %>% head()

## [1] 6.8 7.4 7.8 7.8 5.9 7.1

# Number of ratings scraped
rating_data %>% length()

## [1] 100

Scrape for Voting Information

# Select the vote-count span of each movie
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Pull the vote-count text out of the selected nodes
votes_data <- votes_data_html %>% html_text()

# Preview the raw vote counts
votes_data %>% head()

## [1] "217,181" "263,328" "652,054" "238,331" "695,556" "176,678"

# Data pre-processing: drop the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data))

# Confirm the converted vote counts
votes_data %>% head()

## [1] 217181 263328 652054 238331 695556 176678

# Number of vote counts scraped
votes_data %>% length()

## [1] 100

Scrape for Movie Director Information

# Select the first link in the paragraph after the description (the director)
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Pull the director names out of the selected nodes
directors_data <- directors_data_html %>% html_text()

# Preview the directors
directors_data %>% head()

## [1] "Antoine Fuqua"  "Thea Sharrock"  "Gareth Edwards" "Theodore Melfi"
## [5] "David Ayer"     "Garth Jennings"

# Data pre-processing: convert the director names to a factor
directors_data <- directors_data %>% as.factor()

# Confirm the factor conversion
directors_data %>% head()

## [1] Antoine Fuqua  Thea Sharrock  Gareth Edwards Theodore Melfi David Ayer    
## [6] Garth Jennings
## 99 Levels: Aisling Walsh Alessandro Carloni Alex Proyas ... Zack Snyder

# Number of directors scraped
directors_data %>% length()

## [1] 100

Scrape for Movie Actor Information

# Select the link that follows the ".ghost" separator in each list item
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

# Pull the actor names out of the selected nodes
actors_data <- actors_data_html %>% html_text()

# Preview the actors
actors_data %>% head()

## [1] "Denzel Washington"   "Emilia Clarke"       "Felicity Jones"     
## [4] "Taraji P. Henson"    "Will Smith"          "Matthew McConaughey"

# Data pre-processing: convert the actor names to a factor
actors_data <- actors_data %>% as.factor()

# Confirm the factor conversion
actors_data %>% head()

## [1] Denzel Washington   Emilia Clarke       Felicity Jones     
## [4] Taraji P. Henson    Will Smith          Matthew McConaughey
## 92 Levels: Adam Sandler Alexander Skarsgård Amy Adams ... Zoey Deutch

# Number of actors scraped
actors_data %>% length()

## [1] 100

Scrape for Movie Rating Information (Ratings Bar)

Looking at the ratings bar

# Scrape the whole ratings bar of each movie as text; the Metascore is
# extracted from this text further down
ratings_bar_data <- html_nodes(webpage, ".ratings-bar") %>%
  html_text2()

# Convert the rating nodes to text. The previous version read
# actors_data_html here, so ratings_data briefly held actor names.
ratings_data <- html_text(rating_data_html)

# Preview the raw ratings
head(ratings_data)

## [1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"

# Data pre-processing: convert the ratings to numbers. This converts
# ratings_data itself rather than silently overwriting it from rating_data.
ratings_data <- as.numeric(ratings_data)

# Confirm the converted ratings
head(ratings_data)

## [1] 6.8 7.4 7.8 7.8 5.9 7.1

# Number of ratings scraped
length(ratings_data)

## [1] 100

Find the Metascore data, including missing values (replaced with NAs), and convert the Metascore to numerical

# Extract the number that precedes " Metascore" in each ratings bar.
# - "\\d+" accepts one-, two- and three-digit scores; the previous "\\d{2}"
#   would read "100 Metascore" as 0 and miss single-digit scores.
# - Movies without a Metascore yield NA, which keeps the vector aligned with
#   the other per-movie vectors.
# - The result stays numeric: the earlier gsub(" ", "", ...) call after
#   as.numeric() coerced the scores back to character.
metascore_data <- ratings_bar_data %>%
  str_extract("\\d+(?= Metascore)") %>%
  as.numeric()

# Preview the Metascore data
head(metascore_data)

## [1] 54 51 65 74 40 59

# Number of Metascore entries (NAs included)
length(metascore_data)

## [1] 100

# Numeric summary, including the count of NAs
summary(metascore_data)

## (stale output removed: it reported Length/Class/Mode for a character vector;
## re-knit to see the numeric summary)

Find the missing gross earnings (automated). Earnings are part of the votes bar in the HTML, so scrape the votes bar and extract the earnings with a regular expression to keep the NAs in context.

# Scrape the votes bar of each movie (votes plus gross, where present) as text
votes_bar_nodes <- html_nodes(webpage, ".sort-num_votes-visible")
votes_bar_data <- html_text2(votes_bar_nodes)

# Preview the votes bar text
votes_bar_data %>% head()

## [1] "Votes: 217,181 | Gross: $93.43M"  "Votes: 263,328 | Gross: $56.25M" 
## [3] "Votes: 652,054 | Gross: $532.18M" "Votes: 238,331 | Gross: $169.61M"
## [5] "Votes: 695,556 | Gross: $325.10M" "Votes: 176,678 | Gross: $270.40M"

# Extract the gross figure that follows the "$" in each votes bar and convert
# it to a number (values look like "$93.43M" — presumably millions of USD).
# - Movies with no gross entry yield NA, keeping the vector aligned.
# - The full figure is kept: the previous substring(x, 2, 6) truncated
#   "532.18" to "532.1".
# - The result stays numeric: the previous gsub("$", "", ...) treated "$" as
#   a regex end-of-string anchor, removed nothing, and coerced the numbers
#   back to character.
gross_data <- votes_bar_data %>%
  str_extract("(?<=\\$)[0-9.]+") %>%
  as.numeric()

# Number of gross entries (NAs included)
length(gross_data)

## [1] 100

Combining all the lists to form a data frame

# Assemble the scraped vectors into one data frame, one row per movie
movies2016 <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Votes = votes_data,
  Director = directors_data,
  Actors = actors_data,
  Meta = metascore_data,
  Gross = gross_data
)

# Inspect the column types and the first few values
movies2016 %>% glimpse()

## Rows: 100
## Columns: 11
## $ Rank        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Title       <chr> "The Magnificent Seven", "Me Before You", "Rogue One: A St…
## $ Description <chr> "Seven gunmen from a variety of backgrounds are brought to…
## $ Runtime     <dbl> 132, 106, 133, 127, 123, 108, 128, 108, 139, 116, 107, 108…
## $ Genre       <fct> Action, Drama, Action, Biography, Action, Animation, Comed…
## $ Rating      <dbl> 6.8, 7.4, 7.8, 7.8, 5.9, 7.1, 8.0, 8.0, 8.1, 7.9, 7.6, 8.0…
## $ Votes       <dbl> 217181, 263328, 652054, 238331, 695556, 176678, 608289, 10…
## $ Director    <fct> Antoine Fuqua, Thea Sharrock, Gareth Edwards, Theodore Mel…
## $ Actors      <fct> Denzel Washington, Emilia Clarke, Felicity Jones, Taraji P…
## $ Meta        <chr> "54", "51", "65", "74", "40", "59", "94", "65", "71", "81"…
## $ Gross       <chr> "93.43", "56.25", "532.1", "169.6", "325.1", "270.4", "151…

First Visualization

# Scatter plot of runtime against rank for all scraped movies
movies2016 %>%
  ggplot(aes(x = Rank, y = Runtime)) +
  geom_point() +
  theme_classic() +
  labs(title = "The Estimate Time 
          & Rank of The 100 Movies of 2016")

Second Visualization

# Scatter plot of lead genre against rank, with points coloured by rank
movies2016 %>%
  ggplot(aes(x = Rank, y = Genre)) +
  geom_point(aes(colour = Rank)) +
  theme_classic() +
  labs(title = "The Lead Movies Genre of 2016")

Filtering Information

# Keep only the movies ranked 1 through 5
Rank2 <- filter(movies2016, Rank < 6)

Loading More Packages

library(devtools)

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.2.3

library(highcharter)

## Warning: package 'highcharter' was built under R version 4.2.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Third Visualization

# Interactive scatter chart of the top-five subset: runtime on the x axis,
# rank on the y axis, one series per movie title
p1 <- hchart(Rank2, "scatter", hcaes(x = Runtime, y = Rank, group = Title)) %>%
  # add a colour axis
  hc_colorAxis() %>%
  # chart-wide font
  hc_chart(
    style = list(
      fontFamily = "NewCenturySchoolbook",
      fontWeight = "bold"
    )
  ) %>%
  # axis titles
  hc_xAxis(title = list(text = "Movies Duration")) %>%
  hc_yAxis(title = list(text = "Rank")) %>%
  # chart title and subtitle
  hc_title(text = "Top 5 Movies of 2016") %>%
  hc_subtitle(text = "and How Long It's Take to Watch Them") %>%
  # theme and shared tooltip
  hc_add_theme(hc_theme_gridlight()) %>%
  hc_tooltip(shared = TRUE)

# Render the chart
p1

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

Filtering the runtime from 140 mins to 160 mins

# Keep movies whose runtime is strictly between 140 and 160 minutes
longestruntime <- filter(movies2016, Runtime > 140, Runtime < 160)

Plot2 Question1

# Runtime against genre for the 140-160 minute subset, one colour per title.
# `label` is mapped to the Title column: the previous quoted 'Title' mapped
# the constant string "Title" instead. geom_point() does not draw labels, so
# the mapping only takes effect if a text layer is added later.
plot2 <- longestruntime %>%
  ggplot(aes(x = Runtime, y = Genre, label = Title)) +
  geom_point(aes(color = Title)) +
  theme_classic() +
  ggtitle("The Longest Runtime Genre")

# Render the plot
plot2

Answer to Question 1: The movie “The Wailing”, from the genre “Drama”, had the longest runtime: 156 minutes.

Question 2: Based on the above data, for runtimes of 130 to 160 minutes, which genre has the highest votes?

# Keep movies whose runtime is strictly between 130 and 160 minutes
highestvotes <- filter(movies2016, Runtime > 130, Runtime < 160)

Plot3 Question 2

# Votes against genre for the 130-160 minute subset, one colour per title.
# `label` is mapped to the Title column: the previous quoted 'Title' mapped
# the constant string "Title" instead. geom_point() does not draw labels, so
# the mapping only takes effect if a text layer is added later.
plot3 <- highestvotes %>%
  ggplot(aes(x = Votes, y = Genre, label = Title)) +
  geom_point(aes(color = Title)) +
  theme_classic() +
  ggtitle("The Highest Voted Movie")

# Render the plot
plot3

Answer Question2: The genre that has the highest votes was action with “Captain America: Civil War.”

Question 3: Based on the above data, across all genres, which genre has the highest average gross earnings for runtimes of 100 to 120 minutes?

Filtering Runtime 100 to 120

# Keep movies whose runtime is strictly between 100 and 120 minutes
highgross <- filter(movies2016, Runtime > 100, Runtime < 120)

Filtering the Average Gross Earnings

# Keep movies whose gross exceeds 200. Gross is compared as a number: when the
# column is character, `Gross > 200.00` compares strings lexicographically, so
# "93.43" counts as greater than "200". as.numeric() is a no-op if Gross is
# already numeric; rows with a missing gross are dropped by filter().
gross2 <- highgross %>%
  filter(as.numeric(Gross) > 200)

Plot4 Question 3

# Gross against genre for the high-grossing 100-120 minute subset.
# Gross is converted with as.numeric() so the x axis is a continuous numeric
# scale; a character column would give a discrete axis with the values
# ordered alphabetically. The conversion is a no-op if Gross is already
# numeric.
plot4 <- gross2 %>%
  ggplot(aes(x = as.numeric(Gross), y = Genre)) +
  xlab("Gross Avarage per 100.00") +
  ylab("Genre") +
  geom_point() +
  theme_classic() +
  ggtitle("The Highest Average Gross 
          Earning in Runtime 100 to 120")

# Render the plot
plot4

Answer Question3: Across all genres “Drama” seems to have the highest average gross earnings in runtime 100 to 120.