#Loading the rvest package
library(rvest)
## Warning: package 'rvest' was built under R version 4.0.4
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#Specifying the url for desired website to be scraped
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'
#Reading the HTML code from the website
webpage <- read_html(url)
#Using CSS selectors to scrape the rankings section
rank_data_html <- html_nodes(webpage,'.text-primary')
#Converting the ranking data to text
rank_data <- html_text(rank_data_html)
#Let's have a look at the rankings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
#Data-Preprocessing: Converting rankings to numerical
rank_data<-as.numeric(rank_data)
#Let's have another look at the rankings
head(rank_data)
## [1] 1 2 3 4 5 6
#Using CSS selectors to scrape the title section
title_data_html <- html_nodes(webpage,'.lister-item-header a')
#Converting the title data to text
title_data <- html_text(title_data_html)
#Let's have a look at the title
head(title_data)
## [1] "Suicide Squad" "Batman v Superman: Dawn of Justice"
## [3] "Captain America: Civil War" "Captain Fantastic"
## [5] "Deadpool" "The Accountant"
#Using CSS selectors to scrape the description section
description_data_html <- html_nodes(webpage,'.ratings-bar+ .text-muted')
#Converting the description data to text
description_data <- html_text(description_data_html)
#Let's have a look at the description data
head(description_data)
## [1] "\n A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "\n Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [3] "\n Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man."
## [4] "\n In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [5] "\n A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [6] "\n As a math savant uncooks the books for a new client, the Treasury Department closes in on his activities, and the body count starts to rise."
#Data-Preprocessing: removing '\n'
description_data<-gsub("\n","",description_data)
#Let's have another look at the description data
head(description_data)
## [1] " A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] " Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [3] " Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man."
## [4] " In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [5] " A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [6] " As a math savant uncooks the books for a new client, the Treasury Department closes in on his activities, and the body count starts to rise."
#Using CSS selectors to scrape the Movie runtime section
runtime_data_html <- html_nodes(webpage,'.text-muted .runtime')
#Converting the runtime data to text
runtime_data <- html_text(runtime_data_html)
#Let's have a look at the runtime
head(runtime_data)
## [1] "123 min" "152 min" "147 min" "118 min" "108 min" "128 min"
#Data-Preprocessing: removing mins and converting it to numerical
runtime_data<-gsub(" min","",runtime_data)
runtime_data<-as.numeric(runtime_data)
#Let's have another look at the runtime data
head(runtime_data)
## [1] 123 152 147 118 108 128
#Using CSS selectors to scrape the Movie genre section
genre_data_html <- html_nodes(webpage,'.genre')
#Converting the genre data to text
genre_data <- html_text(genre_data_html)
#Let's have a look at the runtime
head(genre_data)
## [1] "\nAction, Adventure, Fantasy "
## [2] "\nAction, Adventure, Sci-Fi "
## [3] "\nAction, Adventure, Sci-Fi "
## [4] "\nComedy, Drama "
## [5] "\nAction, Adventure, Comedy "
## [6] "\nAction, Crime, Drama "
#Data-Preprocessing: removing \n
genre_data<-gsub("\n","",genre_data)
#Data-Preprocessing: removing excess spaces
genre_data<-gsub(" ","",genre_data)
#taking only the first genre of each movie
genre_data<-gsub(",.*","",genre_data)
#Convering each genre from text to factor
genre_data<-as.factor(genre_data)
#Let's have another look at the genre data
head(genre_data)
## [1] Action Action Action Comedy Action Action
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
#Using CSS selectors to scrape the IMDB rating section
rating_data_html <- html_nodes(webpage,'.ratings-imdb-rating strong')
#Converting the ratings data to text
rating_data <- html_text(rating_data_html)
#Let's have a look at the ratings
head(rating_data)
## [1] "6.0" "6.4" "7.8" "7.9" "8.0" "7.3"
length(rating_data)
## [1] 100
#Data-Preprocessing: converting ratings to numerical
rating_data<-as.numeric(rating_data)
#Let's have another look at the ratings data
head(rating_data)
## [1] 6.0 6.4 7.8 7.9 8.0 7.3
#Using CSS selectors to scrape the votes section
votes_data_html <- html_nodes(webpage,'.sort-num_votes-visible span:nth-child(2)')
#Converting the votes data to text
votes_data <- html_text(votes_data_html)
#Let's have a look at the votes data
head(votes_data)
## [1] "612,340" "643,328" "676,216" "194,590" "913,877" "264,414"
#Using CSS selectors to scrape the votes section
votes_data_html <- html_nodes(webpage,'.sort-num_votes-visible span:nth-child(2)')
#Converting the votes data to text
votes_data <- html_text(votes_data_html)
#Let's have a look at the votes data
head(votes_data)
## [1] "612,340" "643,328" "676,216" "194,590" "913,877" "264,414"
votes_data<-gsub(",","",votes_data)
votes_data <- as.numeric(votes_data)
votes_data
## [1] 612340 643328 676216 194590 913877 264414 23825 285493 278442 603986
## [11] 354639 609998 433137 444278 514306 243055 562899 148744 398164 164738
## [21] 439861 212841 114038 278206 135913 117762 25809 230379 186965 203037
## [31] 157481 204438 107646 415329 225924 171036 171750 183084 217142 251126
## [41] 83368 121679 42049 143451 243357 129011 228710 210415 101745 205964
## [51] 177775 206701 85543 168210 52343 161897 160035 293893 78611 101627
## [61] 69549 242031 39332 252850 115453 140902 51665 81742 187544 260757
## [71] 52238 52825 106499 58418 10673 59802 27159 88446 13414 99071
## [81] 73275 106344 62332 60464 161660 244045 14410 140067 110744 11486
## [91] 121477 146972 125421 153943 93680 41935 65540 53590 81526 14604
#Using CSS selectors to scrape the directors section
directors_data_html <- html_nodes(webpage,'.text-muted+ p a:nth-child(1)')
#Converting the directors data to text
directors_data <- html_text(directors_data_html)
#Let's have a look at the directors data
head(directors_data)
## [1] "David Ayer" "Zack Snyder" "Anthony Russo" "Matt Ross"
## [5] "Tim Miller" "Gavin O'Connor"
length(directors_data)
## [1] 100
#Data-Preprocessing: converting directors data into factors
directors_data<-as.factor(directors_data)
#Using CSS selectors to scrape the actors section
actors_data_html <- html_nodes(webpage,'.lister-item-content .ghost+ a')
#Converting the gross actors data to text
actors_data <- html_text(actors_data_html)
#Let's have a look at the actors data
head(actors_data)
## [1] "Will Smith" "Ben Affleck" "Chris Evans" "Viggo Mortensen"
## [5] "Ryan Reynolds" "Ben Affleck"
#Using CSS selectors to scrape the metascore section
metascore_data_html <- html_nodes(webpage,'.metascore')
#Converting the runtime data to text
metascore_data <- html_text(metascore_data_html)
#Let's have a look at the metascore data
head(metascore_data)
## [1] "40 " "44 " "75 " "72 " "65 "
## [6] "51 "
#Data-Preprocessing: removing extra space in metascore
metascore_data<-gsub(" ","",metascore_data)
#Lets check the length of metascore data
length(metascore_data)
## [1] 97
for (i in c(18, 57, 100)){
metascore_data <- append(metascore_data, NA, i-1)
}
#Data-Preprocessing: converting metascore to numerical
as.numeric(metascore_data)
## [1] 40 44 75 72 65 51 67 70 81 81 41 72 62 71 94 67 65 NA 52 44 78 51 81 99 59
## [26] 84 51 68 54 79 57 74 25 66 71 72 48 57 69 96 51 48 38 60 32 66 65 58 79 60
## [51] 66 88 51 32 83 42 NA 76 66 33 66 74 48 46 60 47 78 78 36 77 45 49 67 34 42
## [76] 70 62 47 82 35 55 77 55 36 52 77 40 58 33 60 59 28 72 68 69 47 64 81 42 NA
#Let's have another look at length of the metascore data
length(metascore_data)
## [1] 100
#Using CSS selectors to scrape the gross revenue section
gross_data_html <- html_nodes(webpage,'.ghost~ .text-muted+ span')
#Converting the gross revenue data to text
gross_data <- html_text(gross_data_html)
#Let's have a look at the votes data
head(gross_data)
## [1] "$325.10M" "$330.36M" "$408.08M" "$5.88M" "$363.07M" "$86.26M"
#Data-Preprocessing: removing '$' and 'M' signs
gross_data<-gsub("M","",gross_data)
gross_data<-substring(gross_data,2,6)
gross_data
## [1] "325.1" "330.3" "408.0" "5.88" "363.0" "86.26" "1.91" "36.26" "248.7"
## [10] "100.5" "100.0" "232.6" "138.2" "67.21" "151.1" "10.64" "532.1" "155.4"
## [19] "126.6" "341.2" "56.25" "5.20" "27.85" "270.4" "2.01" "36.87" "158.8"
## [28] "93.43" "5.02" "87.24" "169.6" "31.15" "234.0" "89.22" "2.13" "75.40"
## [37] "43.03" "51.74" "47.70" "46.01" "52.85" "26.41" "40.10" "47.37" "12.79"
## [46] "102.4" "162.4" "7.10" "128.3" "97.69" "26.86" "1.33" "103.1" "3.37"
## [55] "34.34" "12.39" "72.08" "55.48" "34.92" "15.43" "125.0" "0.04" "65.08"
## [64] "113.2" "58.70" "8.58" "54.65" "364.0" "10.91" "10.38" "14.90" "3.02"
## [73] "0.18" "8.11" "0.23" "48.39" "153.7" "14.43" "14.27" "127.4" "486.3"
## [82] "21.59" "66.18" "0.61" "55.12" "62.68" "30.08" "61.43" "31.89" "20.78"
## [91] "3.44" "35.82"
#Let's check the length of gross data
length(gross_data)
## [1] 92
for (i in c(18, 67, 73, 75, 83, 87, 98, 100)){
gross_data <- append(gross_data, NA, i-1)
}
#Data-Preprocessing: converting gross to numerical
gross_data<-as.numeric(gross_data)
#Let's have another look at the length of gross data
length(gross_data)
## [1] 100
summary(gross_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.04 14.39 52.30 91.32 116.15 532.10 8
#Combining all the lists to form a data frame
movies_df<-data.frame(
Rank = rank_data,
Title = title_data,
Description = description_data,
Runtime = runtime_data,
Genre = genre_data,
Rating = rating_data,
Metascore = metascore_data,
Votes = votes_data,
Gross_Earning_in_Mil = gross_data,
Director = directors_data,
Actor = actors_data)
#Structure of the data frame
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Suicide Squad" "Batman v Superman: Dawn of Justice" "Captain America: Civil War" "Captain Fantastic" ...
## $ Description : chr " A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ " Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world "| __truncated__ " Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man." " In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical "| __truncated__ ...
## $ Runtime : num 123 152 147 118 108 128 120 116 107 116 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 5 1 1 1 1 3 7 ...
## $ Rating : num 6 6.4 7.8 7.9 8 7.3 6.8 7.4 7.6 7.9 ...
## $ Metascore : chr "40" "44" "75" "72" ...
## $ Votes : num 612340 643328 676216 194590 913877 ...
## $ Gross_Earning_in_Mil: num 325.1 330.3 408 5.88 363 ...
## $ Director : Factor w/ 98 levels "Adam Wingard",..: 23 98 6 61 93 36 40 86 82 27 ...
## $ Actor : chr "Will Smith" "Ben Affleck" "Chris Evans" "Viggo Mortensen" ...
library('ggplot2')
qplot(data = movies_df,Runtime,fill = Genre,bins = 30)
runtime <- movies_df %>%
filter(Runtime > 155)
runtime
## Rank Title
## 1 49 Silence
## 2 57 Dangal
## 3 98 The Wailing
## Description
## 1 In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism.
## 2 Former wrestler Mahavir Singh Phogat and his two wrestler daughters struggle towards glory at the Commonwealth Games in the face of societal oppression.
## 3 Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## 1 161 Drama 7.2 79 101745 7.10 Martin Scorsese
## 2 161 Action 8.4 <NA> 160035 12.39 Nitesh Tiwari
## 3 156 Horror 7.5 81 53590 NA Hong-jin Na
## Actor
## 1 Andrew Garfield
## 2 Aamir Khan
## 3 Jun Kunimura
ggplot(movies_df,aes(x=Runtime,y=Rating))+
geom_point(aes(size=Votes,col=Genre))
movies_df2 <- movies_df %>%
filter(Runtime <= 160 & Runtime >= 130) %>%
pivot_wider(names_from = "Genre", values_from = "Votes")
movies_df2
## # A tibble: 18 x 15
## Rank Title Description Runtime Rating Metascore Gross_Earning_i~ Director
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <fct>
## 1 2 Batm~ " Feari~ 152 6.4 44 330. Zack Sn~
## 2 3 Capt~ " Polit~ 147 7.8 75 408 Anthony~
## 3 14 Hack~ " World~ 139 8.1 71 67.2 Mel Gib~
## 4 17 Rogu~ " The d~ 133 7.8 65 532. Gareth ~
## 5 19 X-Me~ " In th~ 144 6.9 52 155. Bryan S~
## 6 26 The ~ " A wom~ 145 8.1 84 2.01 Chan-wo~
## 7 29 The ~ " Seven~ 132 6.9 54 93.4 Antoine~
## 8 34 Fant~ " The a~ 132 7.3 66 234 David Y~
## 9 40 Manc~ " A dep~ 137 7.8 96 47.7 Kenneth~
## 10 42 13 H~ " Durin~ 144 7.3 48 52.8 Michael~
## 11 47 The ~ " Ed an~ 134 7.3 65 102. James W~
## 12 67 A Si~ " A you~ 130 8.1 78 NA Naoko Y~
## 13 68 The ~ " A tru~ 141 6.6 78 8.58 James G~
## 14 78 A Cu~ " An am~ 146 6.4 47 8.11 Gore Ve~
## 15 88 Snow~ " The N~ 134 7.3 58 21.6 Oliver ~
## 16 95 Patr~ " The s~ 133 7.4 69 31.9 Peter B~
## 17 97 Miss~ " In th~ 132 7.5 64 3.44 John Ma~
## 18 98 The ~ " Soon ~ 156 7.5 81 NA Hong-ji~
## # ... with 7 more variables: Actor <chr>, Action <dbl>, Biography <dbl>,
## # Drama <dbl>, Adventure <dbl>, Horror <dbl>, Animation <dbl>
colMeans(movies_df2[10:15], na.rm = TRUE)
## Action Biography Drama Adventure Horror Animation
## 383275.9 222029.0 130718.5 415329.0 141150.0 51665.0
ggplot(movies_df,aes(x=Runtime,y=Gross_Earning_in_Mil))+
geom_point(aes(size=Rating,col=Genre))
## Warning: Removed 8 rows containing missing values (geom_point).
movies_df3 <- movies_df %>%
pivot_wider(names_from = "Genre", values_from = "Gross_Earning_in_Mil") %>%
filter(Runtime <= 120 & Runtime >=100)
movies_df3
## # A tibble: 50 x 17
## Rank Title Description Runtime Rating Metascore Votes Director Actor Action
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <fct> <chr> <dbl>
## 1 4 Capt~ " In th~ 118 7.9 72 194590 Matt Ro~ Vigg~ NA
## 2 5 Dead~ " A wis~ 108 8 65 913877 Tim Mil~ Ryan~ 363
## 3 7 Shin~ " Japan~ 120 6.8 67 23825 Hideaki~ Hiro~ 1.91
## 4 8 The ~ " In 19~ 116 7.4 70 285493 Shane B~ Russ~ 36.3
## 5 9 Moana " In An~ 107 7.6 81 278442 Ron Cle~ Auli~ NA
## 6 10 Arri~ " A lin~ 116 7.9 81 603986 Denis V~ Amy ~ NA
## 7 11 Pass~ " A mal~ 116 7 41 354639 Morten ~ Jenn~ NA
## 8 12 Doct~ " While~ 115 7.5 72 609998 Scott D~ Bene~ 233.
## 9 13 Split " Three~ 117 7.3 62 433137 M. Nigh~ Jame~ NA
## 10 16 Noct~ " A wea~ 116 7.5 67 243055 Tom Ford Amy ~ NA
## # ... with 40 more rows, and 7 more variables: Comedy <dbl>, Animation <dbl>,
## # Drama <dbl>, Horror <dbl>, Biography <dbl>, Crime <dbl>, Adventure <dbl>
colMeans(movies_df3[10:17], na.rm = TRUE)
## Action Comedy Animation Drama Horror Biography Crime Adventure
## 70.63739 27.86500 216.33000 49.30833 69.76500 35.85333 75.40000 141.56000
Silence and Dangal which are drama and action respectively have the same run time of 161 minutes.
Of movies that are between 130 and 160 minutes the adventure movies had the most average votes with 415,329 votes.
Of movies that were between 100 and 120 minutes, Adventure movies had the highest income of 141.56 million dollars.