library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("rvest")
## Warning: package 'rvest' was built under R version 4.2.3
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library("ggplot2")
# Page to scrape: IMDb search for feature films released in 2016 (count=100)
url <- "https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page's HTML
webpage <- url %>% read_html()
Check the length of each vector to ensure that it contains 100 elements, using NAs for missing data so that every vector still has 100 elements.
# Scrape the rankings section using its CSS selector
rank_data_html <- html_nodes(webpage, ".text-primary")
# Convert the ranking nodes to text
rank_data <- html_text(rank_data_html)
# Check the raw rankings (pass the object itself, not the string "rank_data",
# otherwise head() just echoes the literal string back)
head(rank_data)
# Data preprocessing: convert rankings to numeric
rank_data <- as.numeric(rank_data)
# Double-check the rankings
head(rank_data)
## [1] 1 2 3 4 5 6
length(rank_data)
## [1] 100
# Title: select the link nodes inside each list item's header
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
# Extract the text of those nodes
title_data <- title_data_html %>% html_text()
# Inspect the first titles and the number of titles scraped
head(title_data)
## [1] "The Magnificent Seven" "Me Before You"
## [3] "Rogue One: A Star Wars Story" "Hidden Figures"
## [5] "Suicide Squad" "Sing"
length(title_data)
## [1] 100
# Description: select the muted paragraph that follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
# Extract the text of those nodes
description_data <- description_data_html %>% html_text()
# Inspect the raw descriptions (note the leading newline)
head(description_data)
## [1] "\nSeven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."
## [2] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
## [3] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [4] "\nThe story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
## [5] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [6] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
# Data preprocessing: drop every newline character
description_data <- gsub("\n", "", description_data, fixed = TRUE)
# Inspect the cleaned descriptions and the number scraped
head(description_data)
## [1] "Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."
## [2] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
## [3] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [4] "The story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
## [5] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [6] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
length(description_data)
## [1] 100
# Runtime: select the runtime nodes
runtine_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Extract the text of those nodes
runtime_data <- runtine_data_html %>% html_text()
# Inspect the raw runtimes
head(runtime_data)
## [1] "132 min" "106 min" "133 min" "127 min" "123 min" "108 min"
# Data preprocessing: strip the "min" suffix, then convert to numeric
runtime_data <- as.numeric(gsub("min", "", runtime_data))
# Inspect the cleaned runtimes and the number scraped
head(runtime_data)
## [1] 132 106 133 127 123 108
length(runtime_data)
## [1] 100
# Scrape the movie genre section using its CSS selector
genre_data_html <- html_nodes(webpage, ".genre")
# Convert the genre nodes to text
genre_data <- html_text(genre_data_html)
# Check the raw genres (leading newline, trailing spaces)
head(genre_data)
## [1] "\nAction, Adventure, Western "
## [2] "\nDrama, Romance "
## [3] "\nAction, Adventure, Sci-Fi "
## [4] "\nBiography, Drama, History "
## [5] "\nAction, Adventure, Fantasy "
## [6] "\nAnimation, Comedy, Family "
# Data preprocessing: remove the newline
genre_data <- gsub("\n", "", genre_data)
# Keep only the first genre of each movie
genre_data <- gsub(",.*", "", genre_data)
# Data preprocessing: remove surrounding whitespace. The previous
# gsub("", "", ...) used an empty pattern and therefore removed nothing, so a
# single-genre entry kept its trailing spaces and could become its own level.
genre_data <- trimws(genre_data)
# Convert each genre from text to factor
genre_data <- as.factor(genre_data)
# Double-check the genres
head(genre_data)
## [1] Action Drama Action Biography Action Animation
# NOTE: the level count below was recorded before the whitespace fix;
# re-knit to refresh it.
## 10 Levels: Action Adventure Animation Biography Comedy ... Horror
length(genre_data)
## [1] 100
Note: this information changes over time because the webpage is updated regularly.
# IMDb rating: select the bold rating value of each movie
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Extract the text of those nodes
rating_data <- rating_data_html %>% html_text()
# Inspect the raw ratings
head(rating_data)
## [1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"
# Data preprocessing: convert the ratings to numeric
rating_data <- rating_data %>% as.numeric()
# Inspect the converted ratings and the number scraped
head(rating_data)
## [1] 6.8 7.4 7.8 7.8 5.9 7.1
length(rating_data)
## [1] 100
# Votes: select the second span of each votes bar
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Extract the text of those nodes
votes_data <- votes_data_html %>% html_text()
# Inspect the raw vote counts
head(votes_data)
## [1] "217,181" "263,328" "652,054" "238,331" "695,556" "176,678"
# Data preprocessing: drop the thousands separators, then convert to numeric
votes_data <- as.numeric(gsub(",", "", votes_data))
# Inspect the converted vote counts and the number scraped
head(votes_data)
## [1] 217181 263328 652054 238331 695556 176678
length(votes_data)
## [1] 100
# Directors: select the first link in the paragraph after the muted text
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Extract the text of those nodes
directors_data <- directors_data_html %>% html_text()
# Inspect the raw director names
head(directors_data)
## [1] "Antoine Fuqua" "Thea Sharrock" "Gareth Edwards" "Theodore Melfi"
## [5] "David Ayer" "Garth Jennings"
# Data preprocessing: convert the director names to a factor
directors_data <- directors_data %>% as.factor()
# Inspect the factor and the number scraped
head(directors_data)
## [1] Antoine Fuqua Thea Sharrock Gareth Edwards Theodore Melfi David Ayer
## [6] Garth Jennings
## 99 Levels: Aisling Walsh Alessandro Carloni Alex Proyas ... Zack Snyder
length(directors_data)
## [1] 100
# Actors: select the link that directly follows the ".ghost" separator
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Extract the text of those nodes
actors_data <- actors_data_html %>% html_text()
# Inspect the raw actor names
head(actors_data)
## [1] "Denzel Washington" "Emilia Clarke" "Felicity Jones"
## [4] "Taraji P. Henson" "Will Smith" "Matthew McConaughey"
# Data preprocessing: convert the actor names to a factor
actors_data <- actors_data %>% as.factor()
# Inspect the factor and the number scraped
head(actors_data)
## [1] Denzel Washington Emilia Clarke Felicity Jones
## [4] Taraji P. Henson Will Smith Matthew McConaughey
## 92 Levels: Adam Sandler Alexander Skarsgård Amy Adams ... Zoey Deutch
length(actors_data)
## [1] 100
# Scrape the whole ratings bar of every movie as text. The Metascore is pulled
# out of this text, so a bar without a Metascore yields NA rather than
# shortening the vector.
ratings_bar_data <- html_nodes(webpage, ".ratings-bar") %>%
  html_text2()
# Convert the IMDb rating nodes to text (previously this read the actor nodes
# by mistake, so the "ratings" preview showed actor names)
ratings_data <- html_text(rating_data_html)
# Check the ratings
head(ratings_data)
## [1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"
# Data preprocessing: convert ratings to numeric
ratings_data <- as.numeric(ratings_data)
# Double-check the ratings
head(ratings_data)
## [1] 6.8 7.4 7.8 7.8 5.9 7.1
length(ratings_data)
## [1] 100
# Extract the digits directly in front of " Metascore". "\\d+" accepts one to
# three digits; the former "\\d{2}" read a score of 100 as "00" and skipped
# single-digit scores. The result stays numeric (the old gsub() call coerced
# it back to character).
metascore_data <- ratings_bar_data %>%
  str_extract("\\d+(?= Metascore)") %>%
  as.numeric()
# Check the metascore values and their count
head(metascore_data)
## [1] 54 51 65 74 40 59
length(metascore_data)
## [1] 100
# Numeric summary (output not recorded here; re-knit to refresh)
summary(metascore_data)
# Scrape the votes bar of every movie and convert it to text
votes_bar_data <- html_nodes(webpage, ".sort-num_votes-visible") %>%
  html_text2()
head(votes_bar_data) # look at the votes bar data
## [1] "Votes: 217,181 | Gross: $93.43M" "Votes: 263,328 | Gross: $56.25M"
## [3] "Votes: 652,054 | Gross: $532.18M" "Votes: 238,331 | Gross: $169.61M"
## [5] "Votes: 695,556 | Gross: $325.10M" "Votes: 176,678 | Gross: $270.40M"
# Extract the number that follows the "$" sign (the figure in front of the "M"
# suffix) and keep it numeric. A bar without a "$" amount gives NA, so the
# vector keeps one entry per movie.
# The former substring(x, 2, 6) cut "532.18" down to "532.1", and the final
# gsub("$", ...) matched only the end of string, turning the numbers back
# into character.
gross_data <- votes_bar_data %>%
  str_extract("(?<=\\$)[0-9.,]+") %>%
  str_remove_all(",") %>%
  as.numeric()
length(gross_data)
## [1] 100
# Guard against silent recycling: data.frame() recycles a vector whose length
# divides the row count, which would misalign the columns without any error.
stopifnot(
  lengths(list(
    title_data, description_data, runtime_data, genre_data, rating_data,
    votes_data, directors_data, actors_data, metascore_data, gross_data
  )) == length(rank_data)
)
# Combine the scraped vectors into one data frame, one row per movie.
# Meta and Gross are coerced to numeric so that later numeric filters and
# plot axes do not fall back to string comparison / discrete scales.
movies2016 <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Votes = votes_data,
  Director = directors_data,
  Actors = actors_data,
  Meta = as.numeric(metascore_data),
  Gross = as.numeric(gross_data)
)
glimpse(movies2016)
## Rows: 100
## Columns: 11
## $ Rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Title <chr> "The Magnificent Seven", "Me Before You", "Rogue One: A St…
## $ Description <chr> "Seven gunmen from a variety of backgrounds are brought to…
## $ Runtime <dbl> 132, 106, 133, 127, 123, 108, 128, 108, 139, 116, 107, 108…
## $ Genre <fct> Action, Drama, Action, Biography, Action, Animation, Comed…
## $ Rating <dbl> 6.8, 7.4, 7.8, 7.8, 5.9, 7.1, 8.0, 8.0, 8.1, 7.9, 7.6, 8.0…
## $ Votes <dbl> 217181, 263328, 652054, 238331, 695556, 176678, 608289, 10…
## $ Director <fct> Antoine Fuqua, Thea Sharrock, Gareth Edwards, Theodore Mel…
## $ Actors <fct> Denzel Washington, Emilia Clarke, Felicity Jones, Taraji P…
## $ Meta <dbl> 54, 51, 65, 74, 40, 59, 94, 65, 71, 81…
## $ Gross <dbl> 93.43, 56.25, …
# Scatter plot of runtime against rank for every scraped movie
movies2016 %>%
  ggplot(aes(x = Rank, y = Runtime)) +
  geom_point() +
  theme_classic() +
  labs(title = "The Estimate Time
& Rank of The 100 Movies of 2016")
# Scatter plot of lead genre against rank, points coloured by rank
movies2016 %>%
  ggplot(aes(x = Rank, y = Genre)) +
  geom_point(aes(colour = Rank)) +
  theme_classic() +
  labs(title = "The Lead Movies Genre of 2016")
# Keep only the movies ranked 1 to 5
Rank2 <- filter(movies2016, Rank < 6)
library(devtools)
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.2.3
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Interactive scatter chart of the movies in Rank2: runtime on x, rank on y,
# one series per title
p1 <- hchart(Rank2, "scatter", hcaes(x = Runtime, y = Rank, group = Title))
# add a colour axis
p1 <- hc_colorAxis(p1)
# chart-wide font
p1 <- hc_chart(
  p1,
  style = list(fontFamily = "NewCenturySchoolbook", fontWeight = "bold")
)
# axis titles
p1 <- hc_xAxis(p1, title = list(text = "Movies Duration"))
p1 <- hc_yAxis(p1, title = list(text = "Rank"))
# chart title and subtitle
p1 <- hc_title(p1, text = "Top 5 Movies of 2016")
p1 <- hc_subtitle(p1, text = "and How Long It's Take to Watch Them")
# theme and shared tooltip
p1 <- hc_add_theme(p1, hc_theme_gridlight())
p1 <- hc_tooltip(p1, shared = TRUE)
# display the chart
p1
# Movies whose runtime lies strictly between 140 and 160 minutes
longestruntime <- movies2016 %>%
  filter(Runtime > 140 & Runtime < 160)
# Runtime against genre, one colour per title.
# `label` is mapped to the Title column; the former quoted 'Title' mapped the
# constant string "Title" to every row instead of the column values.
plot2 <- longestruntime %>%
  ggplot(aes(x = Runtime, y = Genre, label = Title)) +
  geom_point(aes(color = Title)) +
  theme_classic() +
  ggtitle("The Longest Runtime Genre")
plot2
Answer to Question 1: The movie “The Wailing”, from the “Drama” genre, had the longest runtime (156 minutes).
# Movies whose runtime lies strictly between 130 and 160 minutes
# NOTE(review): the subset is chosen by runtime although the plot is about
# votes; confirm this restriction is intended.
highestvotes <- movies2016 %>%
  filter(Runtime > 130 & Runtime < 160)
# Votes against genre, one colour per title.
# `label` is mapped to the Title column; the former quoted 'Title' mapped the
# constant string "Title" to every row instead of the column values.
plot3 <- highestvotes %>%
  ggplot(aes(x = Votes, y = Genre, label = Title)) +
  geom_point(aes(color = Title)) +
  theme_classic() +
  ggtitle("The Highest Voted Movie")
plot3
Answer to Question 2: The genre with the highest votes was Action, with “Captain America: Civil War.”
# Movies whose runtime lies strictly between 100 and 120 minutes.
# Gross is converted to numeric first: when the column is character,
# `Gross > 200.00` compares strings (e.g. "93.43" > "200" is TRUE) and the
# plot would use a discrete x axis. as.numeric() is a no-op if the column is
# already numeric.
highgross <- movies2016 %>%
  mutate(Gross = as.numeric(Gross)) %>%
  filter(Runtime > 100 & Runtime < 120)
# Keep the movies with a gross above 200; rows with a missing gross are
# dropped by filter()
gross2 <- highgross %>%
  filter(Gross > 200.00)
# Gross against genre for the remaining movies
plot4 <- gross2 %>%
  ggplot(aes(x = Gross, y = Genre)) +
  xlab("Gross Avarage per 100.00") +
  ylab("Genre") +
  geom_point() +
  theme_classic() +
  ggtitle("The Highest Average Gross
Earning in Runtime 100 to 120")
plot4
Answer to Question 3: Across all genres, “Drama” seems to have the highest average gross earnings among movies with a runtime between 100 and 120 minutes.