Now, let’s get started with scraping the IMDb website for the 100 most popular feature films released in 2016. You can access them here.
library(stringr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
#Loading the rvest package
library('rvest')
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# URL of the IMDb search page listing 100 feature films released in 2016.
# HTTPS is used so the request is not sent in plain text and does not rely on
# the server redirecting from HTTP.
url <- "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page once; the selectors that follow query this object
webpage <- read_html(url)
# Rankings: select the ranking label of every list item
rank_data_html <- html_nodes(webpage, ".text-primary")
# Extract the text of each ranking node
rank_data <- html_text(rank_data_html)
# Let's have a look at the raw rankings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# Data-Preprocessing: keep only the digits before converting, so the trailing
# "." (or any other punctuation in the label) cannot turn a rank into NA.
# The conversion is done once; repeating it would be a no-op.
rank_data <- as.numeric(gsub("[^0-9]", "", rank_data))
# Let's have another look at the rankings
head(rank_data)
## [1] 1 2 3 4 5 6
# Title: select the link inside each list item's header
title_data_html <- webpage %>%
  html_nodes(".lister-item-header a")
# Pull the text out of the selected nodes
title_data <- title_data_html %>%
  html_text()
# Preview the first few titles
head(title_data)
## [1] "Suicide Squad" "The Conjuring 2" "Captain Fantastic"
## [4] "Sing" "Deadpool" "Hidden Figures"
# Description: select the muted paragraph that directly follows a ratings bar
description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")
# Pull the text out of the selected nodes
description_data <- description_data_html %>%
  html_text()
# Preview the raw descriptions
head(description_data)
## [1] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "\nEd and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."
## [3] "\nIn the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [4] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [5] "\nA wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [6] "\nThe story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
# Data-Preprocessing: delete every newline character from the descriptions
# (matched literally rather than as a regular expression)
description_data <- gsub("\n", "", description_data, fixed = TRUE)
# Preview the cleaned descriptions
head(description_data)
## [1] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."
## [3] "In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [4] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
## [5] "A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."
## [6] "The story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
# Runtime: select the runtime element of every list item
runtime_data_html <- webpage %>%
  html_nodes(".text-muted .runtime")
# Pull the text out of the selected nodes
runtime_data <- runtime_data_html %>%
  html_text()
# Preview the raw runtimes
head(runtime_data)
## [1] "123 min" "134 min" "118 min" "108 min" "108 min" "127 min"
# Data-Preprocessing: drop the " min" suffix and convert to numeric
runtime_data <- as.numeric(gsub(" min", "", runtime_data))
# Preview the numeric runtimes
head(runtime_data)
## [1] 123 134 118 108 108 127
# Genre: select the genre element of every list item
genre_data_html <- webpage %>%
  html_nodes(".genre")
# Pull the text out of the selected nodes
genre_data <- genre_data_html %>%
  html_text()
# Preview the raw genre strings
head(genre_data)
## [1] "\nAction, Adventure, Fantasy "
## [2] "\nHorror, Mystery, Thriller "
## [3] "\nComedy, Drama "
## [4] "\nAnimation, Comedy, Family "
## [5] "\nAction, Adventure, Comedy "
## [6] "\nBiography, Drama, History "
# Data-Preprocessing: strip newlines, then all spaces, then everything from
# the first comma onward, which leaves only the first genre listed per film
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .)
# Convert each genre from text to a factor
genre_data <- as.factor(genre_data)
# Preview the cleaned genre data
head(genre_data)
## [1] Action Horror Comedy Animation Action Biography
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# IMDb rating: select the bold rating value inside each ratings block
rating_data_html <- webpage %>%
  html_nodes(".ratings-imdb-rating strong")
# Pull the text out of the selected nodes
rating_data <- rating_data_html %>%
  html_text()
# Preview the raw ratings
head(rating_data)
## [1] "5.9" "7.3" "7.9" "7.1" "8.0" "7.8"
# Data-Preprocessing: convert the rating strings to numeric
rating_data <- rating_data %>%
  as.numeric()
# Preview the numeric ratings
head(rating_data)
## [1] 5.9 7.3 7.9 7.1 8.0 7.8
# Votes: select the second span of each votes bar
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Pull the text out of the selected nodes
votes_data <- votes_data_html %>%
  html_text()
# Preview the raw vote counts
head(votes_data)
## [1] "622,782" "239,722" "199,906" "138,652" "928,628" "208,152"
# Data-Preprocessing: remove the commas and convert to numeric
votes_data <- as.numeric(gsub(",", "", votes_data))
# Preview the numeric vote counts
head(votes_data)
## [1] 622782 239722 199906 138652 928628 208152
# Directors: select the first link in the paragraph after the muted text
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")
# Pull the text out of the selected nodes
directors_data <- directors_data_html %>%
  html_text()
# Preview the directors
head(directors_data)
## [1] "David Ayer" "James Wan" "Matt Ross" "Garth Jennings"
## [5] "Tim Miller" "Theodore Melfi"
# Data-Preprocessing: store the directors as a factor
directors_data <- directors_data %>%
  as.factor()
# Actors: select the link that directly follows the ghost separator in each
# list item's content
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")
# Pull the text out of the selected nodes
actors_data <- actors_data_html %>%
  html_text()
# Preview the actors
head(actors_data)
## [1] "Will Smith" "Vera Farmiga" "Viggo Mortensen"
## [4] "Matthew McConaughey" "Ryan Reynolds" "Taraji P. Henson"
# Data-Preprocessing: store the actors as a factor
actors_data <- actors_data %>%
  as.factor()
# Metascore: some films have no Metascore, so selecting '.metascore' across
# the whole page returns fewer values than there are films and gives no way
# to tell which films are missing. Patching NAs in at hand-written positions
# breaks as soon as the listing changes, so each film's own container is
# queried instead: html_node() returns exactly one result per container (a
# missing node when nothing matches) and html_text() reports a missing node
# as NA.
metascore_data_html <- webpage %>%
  html_nodes(".lister-item-content") %>%
  html_node(".metascore")
# Converting the metascore data to text
metascore_data <- html_text(metascore_data_html)
# Let's have a look at the metascore
head(metascore_data)
# Data-Preprocessing: remove the padding around each score and convert it to
# numerical; films without a Metascore stay NA
metascore_data <- as.numeric(trimws(metascore_data))
# Let's check the length of the metascore data: one entry per film
length(metascore_data)
## [1] 100
# Gross revenue: select the span that follows the muted label after the ghost
# separator. Films without a gross figure produce no node, so this vector can
# be shorter than the number of films.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
# Converting the gross revenue data to text
gross_data <- html_text(gross_data_html)
# Let's have a look at the gross revenue data
head(gross_data)
## [1] "$325.10M" "$102.47M" "$5.88M" "$270.40M" "$363.07M" "$169.61M"
# Data-Preprocessing: remove the '$' and 'M' signs. The characters are removed
# by pattern rather than by fixed position, because cutting the string to
# five characters truncated values such as "102.47" to "102.4".
gross_data <- gsub("[$M]", "", gross_data)
# Let's check the length of gross data
length(gross_data)
## [1] 89
# Scrape every ratings bar and convert it to text
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))
# Look at the ratings bar text
head(ratings_bar_data)
## [1] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [2] "7.3\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.3/10 X \n65 Metascore"
## [3] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n72 Metascore"
## [4] "7.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.1/10 X \n59 Metascore"
## [5] "8.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 8/10 X \n65 Metascore"
## [6] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n74 Metascore"
# Extract the Metascore from each ratings bar. The capture group accepts one
# to three digits so that a score of 100 or a single-digit score is read in
# full; a fixed two-digit pattern would read "100 Metascore" as "00" and
# return NA for a single digit. Bars without a Metascore yield NA.
metascore_data <- as.numeric(
  str_match(ratings_bar_data, "(\\d{1,3}) Metascore")[, 2]
)
length(metascore_data)
## [1] 100
metascore_data
## [1] 40 65 72 59 65 74 81 62 54 72 67 81 75 71 94 70 78 51 44 41 84 72 65 68 25
## [26] 79 71 51 66 51 48 52 99 NA 48 96 57 44 32 57 88 79 77 52 80 58 28 81 66 78
## [51] 81 32 76 66 42 60 62 33 51 67 52 81 46 NA 69 23 77 58 58 47 49 23 59 36 46
## [76] 60 78 42 39 55 49 NA 77 51 64 68 55 NA 65 72 74 35 26 40 42 66 34 36 33 55
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 23.00 46.75 59.50 59.15 72.00 99.00 4
# Scrape every votes bar and convert it to text
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))
# Look at the votes bar text
head(votes_bar_data)
## [1] "Votes: 622,782 | Gross: $325.10M" "Votes: 239,722 | Gross: $102.47M"
## [3] "Votes: 199,906 | Gross: $5.88M" "Votes: 138,652 | Gross: $270.40M"
## [5] "Votes: 928,628 | Gross: $363.07M" "Votes: 208,152 | Gross: $169.61M"
# Extract the gross earnings (in millions) from each votes bar. The number
# after the '$' is captured whole instead of being cut to a fixed width, which
# truncated values such as "102.47" to "102.4". Bars without a gross figure
# yield NA.
gross_data <- str_match(votes_bar_data, "\\$([0-9.,]+)")[, 2]
gross_data <- as.numeric(gsub(",", "", gross_data)) # drop any thousands separators
length(gross_data)
## [1] 100
# Assemble the scraped vectors into one data frame, one column per attribute
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Inspect the structure of the data frame
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Suicide Squad" "The Conjuring 2" "Captain Fantastic" "Sing" ...
## $ Description : chr "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house pl"| __truncated__ "In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and "| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ ...
## $ Runtime : num 123 134 118 108 108 127 107 117 132 115 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 1 8 5 3 1 4 3 8 1 1 ...
## $ Rating : num 5.9 7.3 7.9 7.1 8 7.8 7.6 7.3 6.9 7.5 ...
## $ Metascore : num 40 65 72 59 65 74 81 62 54 72 ...
## $ Votes : num 622782 239722 199906 138652 928628 ...
## $ Gross_Earning_in_Mil: num 325.1 102.4 5.88 270.4 363 ...
## $ Director : Factor w/ 99 levels "Alex Proyas",..: 23 42 59 35 95 93 83 56 8 87 ...
## $ Actor : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 88 86 87 59 73 81 6 39 22 8 ...
Once you have the data, you can perform several tasks, such as analyzing the data, drawing inferences from it, or training machine learning models on it. I have gone on to create some interesting visualizations from the data we have just scraped. Follow the visualizations and answer the questions given below. Post your answers in the comment section below.
library("ggplot2")
# Histogram of runtimes in 30 bins, with bars filled by genre. Written with
# ggplot() + geom_histogram() rather than qplot(), which is deprecated in
# current ggplot2 releases, and to match the other plots in this file.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)
The film with the longest runtime belongs to the Adventure genre: American Honey, at 163 minutes.
# Normalise the column names: lower-case them, then remove any spaces
names(movies_df) <- gsub(" ", "", tolower(names(movies_df)))
# Inspect the structure again to confirm the new names
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ title : chr "Suicide Squad" "The Conjuring 2" "Captain Fantastic" "Sing" ...
## $ description : chr "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house pl"| __truncated__ "In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and "| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ ...
## $ runtime : num 123 134 118 108 108 127 107 117 132 115 ...
## $ genre : Factor w/ 8 levels "Action","Adventure",..: 1 8 5 3 1 4 3 8 1 1 ...
## $ rating : num 5.9 7.3 7.9 7.1 8 7.8 7.6 7.3 6.9 7.5 ...
## $ metascore : num 40 65 72 59 65 74 81 62 54 72 ...
## $ votes : num 622782 239722 199906 138652 928628 ...
## $ gross_earning_in_mil: num 325.1 102.4 5.88 270.4 363 ...
## $ director : Factor w/ 99 levels "Alex Proyas",..: 23 42 59 35 95 93 83 56 8 87 ...
## $ actor : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 88 86 87 59 73 81 6 39 22 8 ...
# Film(s) with the longest runtime. na.rm = TRUE matters: if any runtime were
# NA, max() would return NA and the comparison would drop every row.
a <- movies_df %>%
  filter(runtime == max(runtime, na.rm = TRUE))
a
## rank title
## 1 45 American Honey
## description
## 1 A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits.
## runtime genre rating metascore votes gross_earning_in_mil director
## 1 163 Adventure 7 80 39829 0.66 Andrea Arnold
## actor
## 1 Sasha Lane
# Rating against runtime; point size encodes votes and colour encodes genre
movies_df %>%
  ggplot(aes(x = runtime, y = rating)) +
  geom_point(mapping = aes(size = votes, colour = genre))
Among films with a runtime of 130–160 minutes, the one with the most votes is an Action film: Captain America: Civil War, with 690,516 votes.
# Films running 130 to 160 minutes (inclusive), most-voted first
b <- movies_df %>%
  filter(runtime >= 130, runtime <= 160) %>%
  arrange(desc(votes))
b
## rank title
## 1 13 Captain America: Civil War
## 2 19 Batman v Superman: Dawn of Justice
## 3 23 Rogue One: A Star Wars Story
## 4 14 Hacksaw Ridge
## 5 29 Fantastic Beasts and Where to Find Them
## 6 32 X-Men: Apocalypse
## 7 36 Manchester by the Sea
## 8 2 The Conjuring 2
## 9 9 The Magnificent Seven
## 10 69 Snowden
## 11 31 13 Hours
## 12 21 The Handmaiden
## 13 77 The Lost City of Z
## 14 85 Miss Sloane
## 15 50 A Silent Voice: The Movie
## 16 62 The Wailing
## description
## 1 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
## 2 Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.
## 3 In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction.
## 4 World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot.
## 5 The adventures of writer Newt Scamander in New York's secret community of witches and wizards seventy years before Harry Potter reads his book in school.
## 6 In the 1980s the X-Men must defeat an ancient all-powerful mutant, En Sabah Nur, who intends to thrive through bringing destruction to the world.
## 7 A depressed uncle is asked to take care of his teenage nephew after the boy's father dies.
## 8 Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit.
## 9 Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist.
## 10 The NSA's illegal surveillance techniques are leaked to the public by one of the agency's employees, Edward Snowden, in the form of thousands of classified documents distributed to the press.
## 11 During an attack on a U.S. compound in Libya, a security team struggles to make sense out of the chaos.
## 12 A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her.
## 13 A true-life drama, centering on British explorer Major Percival Fawcett, who disappeared whilst searching for a mysterious city in the Amazon in the 1920s.
## 14 In the high-stakes world of political power-brokers, Elizabeth Sloane is the most sought after and formidable lobbyist in D.C. But when taking on the most powerful opponent of her career, she finds winning may come at too high a price.
## 15 A young man is ostracized by his classmates after he bullies a deaf girl to the point where she moves away. Years later, he sets off on a path for redemption.
## 16 Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## runtime genre rating metascore votes gross_earning_in_mil
## 1 147 Action 7.8 75 690516 408.00
## 2 152 Action 6.4 44 653739 330.30
## 3 133 Action 7.8 65 570605 532.10
## 4 139 Biography 8.1 71 455091 67.21
## 5 132 Adventure 7.3 66 421202 234.00
## 6 144 Action 6.9 52 403201 155.40
## 7 137 Drama 7.8 96 255992 47.70
## 8 134 Horror 7.3 65 239722 102.40
## 9 132 Action 6.9 54 189408 93.43
## 10 134 Biography 7.3 58 142449 21.59
## 11 144 Action 7.3 48 124834 52.85
## 12 145 Drama 8.1 84 122537 2.01
## 13 141 Biography 6.6 78 83224 8.58
## 14 132 Drama 7.5 64 67366 3.44
## 15 130 Animation 8.2 78 56950 NA
## 16 156 Horror 7.5 81 56377 NA
## director actor
## 1 Anthony Russo Chris Evans
## 2 Zack Snyder Ben Affleck
## 3 Gareth Edwards Felicity Jones
## 4 Mel Gibson Andrew Garfield
## 5 David Yates Eddie Redmayne
## 6 Bryan Singer James McAvoy
## 7 Kenneth Lonergan Casey Affleck
## 8 James Wan Vera Farmiga
## 9 Antoine Fuqua Denzel Washington
## 10 Oliver Stone Joseph Gordon-Levitt
## 11 Michael Bay John Krasinski
## 12 Park Chan-Wook Kim Min-hee
## 13 James Gray Charlie Hunnam
## 14 John Madden Jessica Chastain
## 15 Naoko Yamada Miyu Irino
## 16 Na Hong-jin Jun Kunimura
# Gross earnings against runtime; point size encodes rating and colour
# encodes genre
movies_df %>%
  ggplot(aes(x = runtime, y = gross_earning_in_mil)) +
  geom_point(mapping = aes(size = rating, colour = genre))
## Warning: Removed 11 rows containing missing values (geom_point).
Among films with a runtime of 100–120 minutes, Animation has the highest average gross earnings (about $216 million).
# Average gross earnings per genre for films running 100 to 120 minutes
# (inclusive), highest first. na.rm = TRUE averages over the films that have
# a gross figure; without it a single missing value makes the whole genre's
# mean NA. A genre with no gross figures at all gets NaN and sorts last.
c <- movies_df %>%
  filter(runtime >= 100 & runtime <= 120) %>%
  group_by(genre) %>%
  summarise(highest = mean(gross_earning_in_mil, na.rm = TRUE)) %>%
  arrange(desc(highest))
c
## # A tibble: 8 x 2
## genre highest
## <fct> <dbl>
## 1 Animation 216.
## 2 Adventure 185.
## 3 Action 78.4
## 4 Horror 69.8
## 5 Drama 52.3
## 6 Biography 35.9
## 7 Comedy 31.5
## 8 Crime NA