library(rvest)
## Loading required package: xml2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# IMDb search URL; its query parameters request 100 feature titles with a
# 2016 release date.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Fetch and parse the page once; the scraping steps below all query `webpage`.
webpage <- url %>% read_html()
# Select the nodes carrying the ranking and extract their text.
rank_data_html <- webpage %>% html_nodes(".text-primary")
rank_data <- rank_data_html %>% html_text()
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# The scraped text has the form "1.", "2.", ...; coerce it to numeric.
rank_data <- rank_data %>% as.numeric()
head(rank_data)
## [1] 1 2 3 4 5 6
# Select the link inside each list item's header; its text is the movie title.
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()
head(title_data)
## [1] "Batman v Superman: Dawn of Justice" "Suicide Squad"
## [3] "Captain America: Civil War" "Captain Fantastic"
## [5] "Deadpool" "Nocturnal Animals"
# Print the complete vector of scraped titles.
title_data
## [1] "Batman v Superman: Dawn of Justice"
## [2] "Suicide Squad"
## [3] "Captain America: Civil War"
## [4] "Captain Fantastic"
## [5] "Deadpool"
## [6] "Nocturnal Animals"
## [7] "Doctor Strange"
## [8] "Arrival"
## [9] "Moana"
## [10] "Hidden Figures"
## [11] "Split"
## [12] "X-Men: Apocalypse"
## [13] "The Nice Guys"
## [14] "The Invisible Guest"
## [15] "Rogue One: A Star Wars Story"
## [16] "La La Land"
## [17] "Zootopia"
## [18] "Miss Peregrine's Home for Peculiar Children"
## [19] "Hacksaw Ridge"
## [20] "The Handmaiden"
## [21] "The Love Witch"
## [22] "Your Name."
## [23] "War Dogs"
## [24] "Gods of Egypt"
## [25] "Manchester by the Sea"
## [26] "The Legend of Tarzan"
## [27] "Shin Godzilla"
## [28] "Star Trek Beyond"
## [29] "Me Before You"
## [30] "Fantastic Beasts and Where to Find Them"
## [31] "Moonlight"
## [32] "Sing"
## [33] "Hunt for the Wilderpeople"
## [34] "Train to Busan"
## [35] "The Girl on the Train"
## [36] "Passengers"
## [37] "Don't Breathe"
## [38] "Lion"
## [39] "The Accountant"
## [40] "13 Hours"
## [41] "The Magnificent Seven"
## [42] "The Conjuring 2"
## [43] "Jason Bourne"
## [44] "The BFG"
## [45] "A Cure for Wellness"
## [46] "Warcraft"
## [47] "Now You See Me 2"
## [48] "Live by Night"
## [49] "Hell or High Water"
## [50] "Dirty Grandpa"
## [51] "Independence Day: Resurgence"
## [52] "10 Cloverfield Lane"
## [53] "The Neon Demon"
## [54] "The 5th Wave"
## [55] "Assassin's Creed"
## [56] "Mike and Dave Need Wedding Dates"
## [57] "Dangal"
## [58] "A Silent Voice: The Movie"
## [59] "The Lost City of Z"
## [60] "Keeping Up with the Joneses"
## [61] "The Founder"
## [62] "Silence"
## [63] "Inferno"
## [64] "Message from the King"
## [65] "Bad Moms"
## [66] "Ghostbusters: Answer the Call"
## [67] "Sausage Party"
## [68] "Allegiant"
## [69] "London Has Fallen"
## [70] "The Jungle Book"
## [71] "The Great Wall"
## [72] "Trolls"
## [73] "Pride and Prejudice and Zombies"
## [74] "The Shallows"
## [75] "Below Her Mouth"
## [76] "Everybody Wants Some!!"
## [77] "Jack Reacher: Never Go Back"
## [78] "The Choice"
## [79] "Snowden"
## [80] "Criminal"
## [81] "The Edge of Seventeen"
## [82] "Hush"
## [83] "Miss Sloane"
## [84] "Deepwater Horizon"
## [85] "Allied"
## [86] "Finding Dory"
## [87] "Patriots Day"
## [88] "Colossal"
## [89] "The Infiltrator"
## [90] "The Fundamentals of Caring"
## [91] "Mother's Day"
## [92] "Sully"
## [93] "The Bad Batch"
## [94] "The Wailing"
## [95] "Triple 9"
## [96] "Toni Erdmann"
## [97] "Swiss Army Man"
## [98] "The Boy"
## [99] "Underworld: Blood Wars"
## [100] "Hail, Caesar!"
# Select the muted paragraph that directly follows each ratings bar and
# extract its text (the plot description).
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
description_data <- description_data_html %>% html_text()
head(description_data)
## [1] "\n Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [2] "\n A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] "\n Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man."
## [4] "\n In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [5] "\n A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [6] "\n A wealthy art gallery owner is haunted by her ex-husband's novel, a violent thriller she interprets as a symbolic revenge tale."
# Drop every newline character from the descriptions (literal match).
description_data <- gsub("\n", "", description_data, fixed = TRUE)
head(description_data)
## [1] " Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [2] " A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] " Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man."
## [4] " In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [5] " A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [6] " A wealthy art gallery owner is haunted by her ex-husband's novel, a violent thriller she interprets as a symbolic revenge tale."
# Confirm how many descriptions were scraped.
length(description_data)
## [1] 100
# Select the runtime nodes and extract their text.
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
runtime_data <- runtime_data_html %>% html_text()
head(runtime_data)
## [1] "152 min" "123 min" "147 min" "118 min" "108 min" "116 min"
# Strip the " min" unit suffix, then coerce what remains to numeric.
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))
head(runtime_data)
## [1] 152 123 147 118 108 116
# Confirm how many runtimes were scraped.
length(runtime_data)
## [1] 100
# Select the genre nodes and extract their text.
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text()
head(genre_data)
## [1] "\nAction, Adventure, Sci-Fi "
## [2] "\nAction, Adventure, Fantasy "
## [3] "\nAction, Adventure, Sci-Fi "
## [4] "\nComedy, Drama "
## [5] "\nAction, Adventure, Comedy "
## [6] "\nDrama, Thriller "
# Data-Preprocessing, in one chain:
#   1. remove every newline and space character,
#   2. keep only the first genre by dropping everything from the first comma,
#   3. convert the remaining genre text to a factor.
genre_data <- genre_data %>%
  gsub("[\n ]", "", .) %>%
  sub(",.*", "", .) %>%
  factor()
# Let's have another look at the genre data
head(genre_data)
## [1] Action Action Action Comedy Action Drama
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Confirm how many genres were scraped.
length(genre_data)
## [1] 100
# Select the IMDb rating nodes.
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Extract the rating text
rating_data <- rating_data_html %>% html_text()
# Inspect the first few raw ratings
head(rating_data)
## [1] "6.4" "6.0" "7.8" "7.9" "8.0" "7.5"
# Data-Preprocessing: coerce the rating text to numeric
rating_data <- rating_data %>% as.numeric()
# Inspect the first few numeric ratings
head(rating_data)
## [1] 6.4 6.0 7.8 7.9 8.0 7.5
# Confirm how many ratings were scraped.
length(rating_data)
## [1] 100
# Select the second span inside each votes container.
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Extract the vote-count text
votes_data <- votes_data_html %>% html_text()
# Inspect the first few raw vote counts
head(votes_data)
## [1] "641,742" "611,126" "675,299" "194,191" "913,147" "242,798"
# Data-Preprocessing: drop the thousands separators, then coerce to numeric
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
# Inspect the first few numeric vote counts
head(votes_data)
## [1] 641742 611126 675299 194191 913147 242798
# Confirm how many vote counts were scraped.
length(votes_data)
## [1] 100
# Select the first link in the paragraph that follows a muted element.
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Extract the director names as text
directors_data <- directors_data_html %>% html_text()
# Inspect the first few directors
head(directors_data)
## [1] "Zack Snyder" "David Ayer" "Anthony Russo" "Matt Ross"
## [5] "Tim Miller" "Tom Ford"
# Data-Preprocessing: store the director names as a factor
directors_data <- factor(directors_data)
# Confirm how many directors were scraped.
length(directors_data)
## [1] 100
# Select the link immediately following the ".ghost" separator in each item.
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Extract the actor names as text
actors_data <- actors_data_html %>% html_text()
# Inspect the first few actors
head(actors_data)
## [1] "Ben Affleck" "Will Smith" "Chris Evans" "Viggo Mortensen"
## [5] "Ryan Reynolds" "Amy Adams"
# Data-Preprocessing: store the actor names as a factor
actors_data <- factor(actors_data)
# Confirm how many actors were scraped.
length(actors_data)
## [1] 100
# Using CSS selectors to scrape the metascore section
metascore_data_html <- html_nodes(webpage, ".metascore")
# Converting the metascore data to text
metascore_data <- html_text(metascore_data_html)
# Let's have a look at the metascore
head(metascore_data)
## [1] "44 " "40 " "75 " "72 " "65 "
## [6] "67 "
# Data-Preprocessing: removing extra space in metascore
metascore_data <- gsub(" ", "", metascore_data)
# Lets check the length of metascore data
length(metascore_data)
## [1] 98
# Data-Preprocessing: converting metascore to numerical.
# This is done BEFORE filling the gaps so that a real NA can be inserted,
# instead of the string "NA" that only became NA through a coercion warning.
metascore_data <- as.numeric(metascore_data)
# Only 98 of the 100 movies have a metascore. Positions 14 and 57 are
# hard-coded for the page as it was scraped in the recorded run; they must be
# re-checked if the page content changes.
# `after = i - 1` places the NA at position i and shifts the rest down, and
# keeps the data an atomic numeric vector (no intermediate list).
for (i in c(14, 57)) {
  metascore_data <- append(metascore_data, NA_real_, after = i - 1)
}
# Let's have another look at length of the metascore data
length(metascore_data)
## [1] 100
# Let's look at summary statistics
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.00 47.00 63.00 59.91 72.00 99.00 2
# Using CSS selectors to scrape the gross revenue section
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
# Converting the gross revenue data to text
gross_data <- html_text(gross_data_html)
# Let's have a look at the gross revenue data
head(gross_data)
## [1] "$330.36M" "$325.10M" "$408.08M" "$5.88M" "$363.07M" "$10.64M"
# Data-Preprocessing: removing '$' and 'M' signs and converting to numerical.
# Both characters are stripped by pattern rather than by fixed character
# positions: the previous substring(x, 2, 6) cut values of 100 or more to
# five characters ("$330.36M" became 330.3), silently losing a digit.
gross_data <- as.numeric(gsub("[$M]", "", gross_data))
# Let's check the length of gross data
length(gross_data)
## [1] 93
# Filling missing entries.
# Only 93 of the 100 movies report a gross figure. The positions below are
# hard-coded for the page as it was scraped in the recorded run and must be
# re-checked if the page content changes.
# -1 is used as the placeholder (in place of NA) so the column stays
# compatible with code that does not pass na.rm; any aggregate over this
# column should exclude the -1 rows.
for (i in c(14, 58, 64, 75, 82, 90, 94)) {
  gross_data <- append(gross_data, -1, after = i - 1)
}
# Let's have another look at the length of gross data
length(gross_data)
## [1] 100
# Summary statistics. The figures recorded in the earlier run were computed
# from the truncated values and are not repeated here; re-run to refresh.
summary(gross_data)
# Assemble the scraped vectors into one data frame, one row per movie.
# data.frame() requires the columns to have compatible lengths.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Inspect the column types and the first few values of each column
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Batman v Superman: Dawn of Justice" "Suicide Squad" "Captain America: Civil War" "Captain Fantastic" ...
## $ Description : chr " Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world "| __truncated__ " A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ " Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man." " In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical "| __truncated__ ...
## $ Runtime : num 152 123 147 118 108 116 115 116 107 127 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 5 1 7 1 7 3 4 ...
## $ Rating : num 6.4 6 7.8 7.9 8 7.5 7.5 7.9 7.6 7.8 ...
## $ Metascore : num 44 40 75 72 65 67 72 81 81 74 ...
## $ Votes : num 641742 611126 675299 194191 913147 ...
## $ Gross_Earning_in_Mil: num 330.3 325.1 408 5.88 363 ...
## $ Director : Factor w/ 98 levels "Alex Proyas",..: 98 23 6 61 93 95 85 26 81 91 ...
## $ Actor : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 8 88 20 87 73 3 9 3 7 82 ...
library(ggplot2)
# Histogram of movie runtimes in 30 bins, with bars filled by genre.
# Written with ggplot() + geom_histogram() rather than qplot(), which is
# deprecated in ggplot2 >= 3.4.0; the resulting plot is the same.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

Question 1: Based on the above data, which movie from which Genre had the longest runtime?
Toni Erdmann, from the Comedy genre, had the longest runtime.
# Keep the row(s) whose Runtime equals the overall maximum; the row names are
# first copied into a "Name" column so they appear in the printed result.
filter(
  rownames_to_column(movies_df, var = "Name"),
  Runtime == max(Runtime)
)
## Name Rank Title
## 1 96 96 Toni Erdmann
## Description
## 1 A practical joking father tries to reconnect with his hard working daughter by creating an outrageous alter ego and posing as her CEO's life coach.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## 1 162 Comedy 7.4 93 44824 1.48 Maren Ade
## Actor
## 1 Sandra Hüller
# Scatter plot of Rating against Runtime; point size encodes Votes and point
# colour encodes Genre.
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

Question 2: Based on the above data, among movies with a runtime of 130-160 minutes, which genre has the highest votes?
Captain America: Civil War (Action genre) was the movie with the most votes in that runtime range in 2016.
# Restrict to movies running 130-160 minutes (both bounds inclusive), then
# keep the row(s) with the most votes within that subset.
df1 <- movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(between(Runtime, 130, 160)) %>%
  filter(Votes == max(Votes))
# Print the result
df1
## Name Rank Title
## 1 3 3 Captain America: Civil War
## Description
## 1 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## 1 147 Action 7.8 75 675299 408 Anthony Russo
## Actor
## 1 Chris Evans
#filter(Runtime == c(130,160)) %>%
#filter(Votes == max(Votes))
# Scatter plot of gross earnings (in millions) against Runtime; point size
# encodes Rating and point colour encodes Genre.
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Rating, colour = Genre))

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?
Among movies with a runtime of 100-120 minutes, Animation had the highest average gross earnings in 2016, at $216.33 million.
# Average gross earnings per genre for movies running 100-120 minutes
# (both bounds inclusive), keeping only the genre(s) with the highest average.
# Movies with no reported gross are stored as the placeholder -1 (or NA);
# they are dropped before averaging so they cannot pull a genre's mean down.
dfq3 <- movies_df %>%
  filter(Runtime >= 100, Runtime <= 120) %>%
  filter(!is.na(Gross_Earning_in_Mil), Gross_Earning_in_Mil >= 0) %>%
  group_by(Genre) %>%
  summarize(averageGross = mean(Gross_Earning_in_Mil)) %>%
  filter(averageGross == max(averageGross))
dfq3
# NOTE: the output below was recorded before the placeholder rows were
# excluded from the average; re-run to refresh it.
## # A tibble: 1 x 2
## Genre averageGross
## <fct> <dbl>
## 1 Animation 216.