library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
# Target page: IMDb title search limited to 100 feature films released in 2016
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page's HTML
webpage <- url %>% read_html()
# Select the ranking nodes through their CSS class
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Pull the text out of each ranking node
rank_data <- rank_data_html %>% html_text()
# Preview the raw rankings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# Parse the ranking strings as numbers
rank_data <- rank_data %>% as.numeric()
# Preview the numeric rankings
head(rank_data)
## [1] 1 2 3 4 5 6
# Select the link inside each movie header; its text is the title
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
# Pull the title text out of each node
title_data <- title_data_html %>% html_text()
# Preview the titles
head(title_data)
## [1] "Deadpool" "Suicide Squad"
## [3] "Rogue One: A Star Wars Story" "Split"
## [5] "Marauders" "Office Christmas Party"
# Scrape the description of every movie: the muted paragraph that directly
# follows the ratings bar
description_data_html <- html_nodes(webpage, ".ratings-bar+ .text-muted")
# Convert the description nodes to text
description_data <- html_text(description_data_html)
head(description_data)
## [1] "\n A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [2] "\n A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] "\n The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star."
## [4] "\n Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."
## [5] "\n When a bank is hit by a brutal heist, all evidence points to the owner and his high-powered clients. But as a group of FBI agents dig deeper into the case - and the deadly heists continue - it becomes clear that a larger conspiracy is at play."
## [6] "\n When his uptight CEO sister threatens to shut down his branch, the branch manager throws an epic Christmas party in order to land a big client and save the day, but the party gets way out of hand..."
# Remove every newline, then trim the surrounding whitespace. Dropping only
# "\n" leaves the indentation that followed it at the start of each description.
description_data <- trimws(gsub("\n", "", description_data))
# Look at the description data again
head(description_data)
## [1] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [2] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] "The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star."
## [4] "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."
## [5] "When a bank is hit by a brutal heist, all evidence points to the owner and his high-powered clients. But as a group of FBI agents dig deeper into the case - and the deadly heists continue - it becomes clear that a larger conspiracy is at play."
## [6] "When his uptight CEO sister threatens to shut down his branch, the branch manager throws an epic Christmas party in order to land a big client and save the day, but the party gets way out of hand..."
# Select the runtime nodes
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Pull the runtime text out of each node
runtime_data <- runtime_data_html %>% html_text()
head(runtime_data)
## [1] "108 min" "123 min" "133 min" "117 min" "107 min" "105 min"
# Drop the " min" suffix and parse what is left as a number
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))
# Preview the numeric runtimes
head(runtime_data)
## [1] 108 123 133 117 107 105
# Select the genre nodes
genre_data_html <- webpage %>% html_nodes(".genre")
# Pull the genre text out of each node
genre_data <- genre_data_html %>% html_text()
head(genre_data)
## [1] "\nAction, Adventure, Comedy "
## [2] "\nAction, Adventure, Fantasy "
## [3] "\nAction, Adventure, Sci-Fi "
## [4] "\nHorror, Thriller "
## [5] "\nAction, Crime, Thriller "
## [6] "\nComedy "
# Clean up in one chain:
#   1. delete every newline and every space
#   2. keep only the first genre by cutting from the first comma onwards
#   3. store the result as a factor
genre_data <- genre_data %>%
  gsub("[\n ]", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()
# Preview the cleaned genres
head(genre_data)
## [1] Action Action Action Horror Action Comedy
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Select the IMDb rating nodes
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Pull the rating text out of each node
rating_data <- rating_data_html %>% html_text()
head(rating_data)
## [1] "8.0" "6.0" "7.8" "7.3" "5.5" "5.8"
# Parse the rating strings as numbers
rating_data <- rating_data %>% as.numeric()
# Preview the numeric ratings
head(rating_data)
## [1] 8.0 6.0 7.8 7.3 5.5 5.8
# Select the vote-count nodes (second span of the votes line)
votes_data_html <- webpage %>% html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Pull the vote-count text out of each node
votes_data <- votes_data_html %>% html_text()
head(votes_data)
## [1] "894,232" "594,545" "547,528" "417,569" "19,271" "67,902"
# Strip the thousands separators, then parse as numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
# Preview the numeric vote counts
head(votes_data)
## [1] 894232 594545 547528 417569 19271 67902
# Select the first link of the credits paragraph of each movie
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Pull the director name out of each node
directors_data <- directors_data_html %>% html_text()
head(directors_data)
## [1] "Tim Miller" "David Ayer" "Gareth Edwards"
## [4] "M. Night Shyamalan" "Steven C. Miller" "Josh Gordon"
# Store the director names as a factor
directors_data <- directors_data %>% as.factor()
# Preview the factor
head(directors_data)
## [1] Tim Miller David Ayer Gareth Edwards M. Night Shyamalan
## [5] Steven C. Miller Josh Gordon
## 98 Levels: Alex Proyas Ana Lily Amirpour André Øvredal ... Zack Snyder
# Select the link that directly follows the "|" separator in each movie's
# credits
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Pull the actor names out of the nodes and store them as a factor
actors_data <- actors_data_html %>%
  html_text() %>%
  as.factor()
# Preview the factor
head(actors_data)
## [1] Ryan Reynolds Will Smith Felicity Jones James McAvoy Bruce Willis
## [6] Jason Bateman
## 90 Levels: Aamir Khan Alexander Skarsgård Amy Adams ... Zoey Deutch
# Scrape the metascore of every movie, keeping exactly one entry per movie.
#
# Not every movie has a metascore, so selecting ".metascore" across the whole
# page returns fewer values than there are movies, and the values no longer
# line up with the other columns. Patching NAs in at hard-coded positions is
# fragile: it depends on one particular snapshot of the page and silently
# misaligns the column when the listing changes.
#
# Instead, take each movie's own container (".lister-item-content") and ask
# for its ".metascore" node. html_node() returns a missing node where there is
# no match, and html_text() turns a missing node into NA, so the result always
# has one entry per movie, in page order.
movie_items <- html_nodes(webpage, ".lister-item-content")
metascore_data_html <- html_node(movie_items, ".metascore")
# Convert the metascore nodes to text (NA where a movie has no metascore)
metascore_data <- html_text(metascore_data_html)
# Look at the metascore data
head(metascore_data)
## [1] "65 " "40 " "65 " "62 " "42 "
## [6] "42 "
# Data-Preprocessing: trim the padding around each score and convert to
# numeric. Missing scores are already NA, so no coercion warnings are raised.
metascore_data <- as.numeric(trimws(metascore_data))
# Guard the alignment the data frame relies on: one value per movie
stopifnot(length(metascore_data) == length(movie_items))
length(metascore_data)
summary(metascore_data)
# Scrape the gross revenue (in millions of dollars) of every movie, keeping
# exactly one entry per movie.
#
# Not every movie lists a gross figure, so selecting the gross span across the
# whole page returns fewer values than there are movies. Inserting NAs at
# hard-coded positions only fits one snapshot of the page and misaligns the
# column otherwise. Querying each movie's own container
# (".lister-item-content") with html_node() yields a missing node, and then NA
# text, wherever the figure is absent, so the result stays aligned with the
# movie list.
movie_items <- html_nodes(webpage, ".lister-item-content")
gross_data_html <- html_node(movie_items, ".ghost~ .text-muted+ span")
# Convert gross revenue nodes to text (NA where a movie has no gross figure)
gross_data <- html_text(gross_data_html)
head(gross_data)
# Strip the "$" prefix and "M" suffix and convert to numeric. Matching the
# two symbols explicitly keeps every digit; cutting a fixed character range
# would truncate values of 100 or more (e.g. "$363.07M" would become 363.0).
gross_data <- as.numeric(gsub("[$M]", "", gross_data))
# Guard the alignment the data frame relies on: one value per movie
stopifnot(length(gross_data) == length(movie_items))
length(gross_data)
summary(gross_data)
# Assemble the scraped vectors into one data frame, one row per movie
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Inspect the column types of the data frame
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Deadpool" "Suicide Squad" "Rogue One: A Star Wars Story" "Split" ...
## $ Description : chr " A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the"| __truncated__ " A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ " The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star." " Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape befo"| __truncated__ ...
## $ Runtime : num 108 123 133 117 107 105 107 115 116 139 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 8 1 5 3 1 7 4 ...
## $ Rating : num 8 6 7.8 7.3 5.5 5.8 7.6 7.5 7.9 8.1 ...
## $ Metascore : num 65 40 65 62 42 42 81 72 81 71 ...
## $ Votes : num 894232 594545 547528 417569 19271 ...
## $ Gross_Earning_in_Mil: num 363 325.1 532.1 138.2 54.8 ...
## $ Director : Factor w/ 98 levels "Alex Proyas",..: 94 24 34 61 88 54 83 86 29 66 ...
## $ Actor : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 73 86 32 37 14 38 6 8 3 4 ...
# Preview the first rows
head(movies_df)
## Rank Title
## 1 1 Deadpool
## 2 2 Suicide Squad
## 3 3 Rogue One: A Star Wars Story
## 4 4 Split
## 5 5 Marauders
## 6 6 Office Christmas Party
## Description
## 1 A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks.
## 2 A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse.
## 3 The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star.
## 4 Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th.
## 5 When a bank is hit by a brutal heist, all evidence points to the owner and his high-powered clients. But as a group of FBI agents dig deeper into the case - and the deadly heists continue - it becomes clear that a larger conspiracy is at play.
## 6 When his uptight CEO sister threatens to shut down his branch, the branch manager throws an epic Christmas party in order to land a big client and save the day, but the party gets way out of hand...
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## 1 108 Action 8.0 65 894232 363.00
## 2 123 Action 6.0 40 594545 325.10
## 3 133 Action 7.8 65 547528 532.10
## 4 117 Horror 7.3 62 417569 138.20
## 5 107 Action 5.5 42 19271 54.77
## 6 105 Comedy 5.8 42 67902 NA
## Director Actor
## 1 Tim Miller Ryan Reynolds
## 2 David Ayer Will Smith
## 3 Gareth Edwards Felicity Jones
## 4 M. Night Shyamalan James McAvoy
## 5 Steven C. Miller Bruce Willis
## 6 Josh Gordon Jason Bateman
# Distribution of runtimes, stacked by genre. Built with ggplot() +
# geom_histogram() rather than qplot(), which is deprecated in
# ggplot2 >= 3.4.0; the resulting histogram is the same.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)
# Movie(s) with the longest runtime. Each column is selected once, and
# na.rm = TRUE keeps the comparison valid if any runtime is missing (max()
# would otherwise return NA and the filter would match no rows).
q1 <- movies_df %>%
  select(Title, Rank, Runtime, Genre) %>%
  filter(Runtime == max(Runtime, na.rm = TRUE))
q1
## Title Rank Runtime Genre
## 1 Silence 57 161 Drama
## 2 Dangal 71 161 Action
# Rating against runtime; point size encodes votes, colour encodes genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre))
# Movies that run between 130 and 160 minutes (both bounds included)
q2 <- movies_df %>%
  filter(Runtime >= 130, Runtime <= 160) %>%
  select(Title, Rank, Runtime, Votes, Genre)
# Horizontal bars of votes per genre for those movies; bars of the same genre
# stack, so each bar's length is that genre's total votes
q2_plot <- ggplot(q2, aes(x = Genre, y = Votes)) +
  geom_col() +
  labs(x = "Movie Genre", y = "Votes", title = "Movie Genres by Vote") +
  coord_flip()
q2_plot
# The single most-voted movie within that runtime range
q2_df <- filter(q2, Votes == max(Votes))
q2_df
## Title Rank Runtime Votes Genre
## 1 Captain America: Civil War 13 147 655335 Action
# Gross earnings against runtime; point size encodes rating, colour encodes
# genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre))
## Warning: Removed 10 rows containing missing values (geom_point).
#Notes:
From the visualizations we can see that action movies received the most votes.
Drama and action movies had the longest runtimes. It is surprising how many people watch action movies, as I would have presumed that people would rather watch a comedy or a drama to sit back with at the end or the start of the day.