library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.1
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
library(rvest)
## Warning: package 'rvest' was built under R version 4.3.1
library(stringr)
# IMDb search URL: feature films released in 2016, 100 results per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the HTML of that page
webpage <- url %>% read_html()
# Select the ranking nodes through their CSS class
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Pull the text out of each ranking node
rank_data <- rank_data_html %>% html_text()
# Preview the first few raw rankings
rank_data %>% head()
## [1] "1." "2." "3." "4." "5." "6."
# Data preprocessing: coerce the ranking strings (e.g. "1.") to numbers
rank_data <- rank_data %>% as.numeric()
# Preview the rankings again after conversion
rank_data %>% head()
## [1] 1 2 3 4 5 6
# Select the title links inside each list item header
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
# Pull the text out of each title node
title_data <- title_data_html %>% html_text()
# Preview the first few titles
title_data %>% head()
## [1] "Arrival" "Hacksaw Ridge"
## [3] "Terrifier" "Suicide Squad"
## [5] "Batman v Superman: Dawn of Justice" "Me Before You"
# Select the description paragraph that follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
# Pull the text out of each description node
description_data <- description_data_html %>% html_text()
# Preview the first few raw descriptions
description_data %>% head()
## [1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
## [2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Data preprocessing: strip the newline characters from the descriptions
description_data <- gsub(pattern = "\n", replacement = "", x = description_data)
# Preview the cleaned descriptions
description_data %>% head()
## [1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
## [2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Select the runtime nodes
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Pull the text out of each runtime node
runtime_data <- runtime_data_html %>% html_text()
# Preview the first few raw runtimes
runtime_data %>% head()
## [1] "116 min" "139 min" "85 min" "123 min" "151 min" "106 min"
# Data preprocessing: drop the " min" suffix, then convert to numeric
runtime_data <- runtime_data %>%
  gsub(pattern = " min", replacement = "") %>%
  as.numeric()
# Preview the cleaned runtimes
runtime_data %>% head()
## [1] 116 139 85 123 151 106
# Select the genre nodes
genre_data_html <- webpage %>% html_nodes(".genre")
# Pull the text out of each genre node
genre_data <- genre_data_html %>% html_text()
# Preview the first few raw genre strings
genre_data %>% head()
## [1] "\nDrama, Mystery, Sci-Fi "
## [2] "\nBiography, Drama, History "
## [3] "\nHorror, Thriller "
## [4] "\nAction, Adventure, Fantasy "
## [5] "\nAction, Adventure, Sci-Fi "
## [6] "\nDrama, Romance "
# Data preprocessing, in order:
#   1. strip newline characters
#   2. strip every space
#   3. keep only the first genre (drop everything from the first comma on)
#   4. convert the remaining text to a factor
genre_data <- genre_data %>%
  gsub(pattern = "\n", replacement = "") %>%
  gsub(pattern = " ", replacement = "") %>%
  gsub(pattern = ",.*", replacement = "") %>%
  as.factor()
# Preview the cleaned genres
genre_data %>% head()
## [1] Drama Biography Horror Action Action Drama
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Select the IMDb rating nodes
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Pull the text out of each rating node
rating_data <- rating_data_html %>% html_text()
# Preview the first few raw ratings
rating_data %>% head()
## [1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"
# Data preprocessing: convert the rating strings to numeric
rating_data <- rating_data %>% as.numeric()
# Preview the ratings again after conversion
rating_data %>% head()
## [1] 7.9 8.1 5.6 5.9 6.4 7.4
# Select the vote-count span (second child of the votes bar)
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Pull the text out of each votes node
votes_data <- votes_data_html %>% html_text()
# Preview the first few raw vote counts
votes_data %>% head()
## [1] "722,916" "553,431" "43,200" "701,352" "714,484" "268,259"
# Data preprocessing: remove the thousands separators, then convert to numeric
votes_data <- votes_data %>%
  gsub(pattern = ",", replacement = "") %>%
  as.numeric()
# Preview the cleaned vote counts
votes_data %>% head()
## [1] 722916 553431 43200 701352 714484 268259
# Select the first link in the credits paragraph (the director)
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Pull the text out of each director node
directors_data <- directors_data_html %>% html_text()
# Preview the first few directors
directors_data %>% head()
## [1] "Denis Villeneuve" "Mel Gibson" "Damien Leone" "David Ayer"
## [5] "Zack Snyder" "Thea Sharrock"
# Data preprocessing: store the directors as a factor
directors_data <- directors_data %>% as.factor()
# Select the first link after the separator in the credits (the lead actor)
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Pull the text out of each actor node
actors_data <- actors_data_html %>% html_text()
# Preview the first few actors
actors_data %>% head()
## [1] "Amy Adams" "Andrew Garfield" "Jenna Kanell" "Will Smith"
## [5] "Ben Affleck" "Emilia Clarke"
# Data preprocessing: store the actors as a factor
actors_data <- actors_data %>% as.factor()
# Select the Metascore nodes
metascore_data_html <- webpage %>% html_nodes(".metascore")
# Pull the text out of each Metascore node
metascore_data <- metascore_data_html %>% html_text()
# Preview the first few raw Metascores
metascore_data %>% head()
## [1] "81 " "71 " "40 " "44 " "51 "
## [6] "65 "
# Data preprocessing: strip the spaces from the Metascore strings
metascore_data <- gsub(pattern = " ", replacement = "", x = metascore_data)
# Check how many Metascores were scraped
metascore_data %>% length()
## [1] 95
# Scrape the whole ratings bar of each movie and convert it to text
ratings_bar_data <- webpage %>%
  html_nodes(".ratings-bar") %>%
  html_text2()
# Preview the first few ratings bars
ratings_bar_data %>% head()
## [1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
## [2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
## [3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
## [6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"
# Extract the Metascore from each ratings bar as a number.
# The lookahead keeps only the digits that immediately precede " Metascore".
# `\\d+` (rather than `\\d{2}`) is required so that a score of 100 is not
# read as "00" and a single-digit score is not dropped.
# Ratings bars without a Metascore yield NA, which keeps the result aligned
# one-to-one with ratings_bar_data.
metascore_data <- ratings_bar_data %>%
  str_extract("\\d+(?= Metascore)") %>%
  as.numeric()
# Check the length: one entry per ratings bar
length(metascore_data)
## [1] 100
# Print the full Metascore vector to see where the NA entries fall
metascore_data
## [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
## [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
## [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
## [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA
# Summary statistics of the Metascores, including the NA count
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 47.00 61.00 60.05 72.50 99.00 5
# Select the gross revenue nodes
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
# Pull the text out of each gross revenue node
gross_data <- gross_data_html %>% html_text()
# Preview the first few raw gross revenue strings
gross_data %>% head()
## [1] "$100.55M" "$67.21M" "$325.10M" "$330.36M" "$56.25M" "$363.07M"
# Scrape the whole votes bar of each movie and convert it to text
votes_bar_data <- webpage %>%
  html_nodes(".sort-num_votes-visible") %>%
  html_text2()
# Preview the first few votes bars
votes_bar_data %>% head()
## [1] "Votes: 722,916 | Gross: $100.55M" "Votes: 553,431 | Gross: $67.21M"
## [3] "Votes: 43,200" "Votes: 701,352 | Gross: $325.10M"
## [5] "Votes: 714,484 | Gross: $330.36M" "Votes: 268,259 | Gross: $56.25M"
# Extract the gross earnings (in millions) from each votes bar as a number.
# The lookbehind keeps the digits and decimal point that follow the '$' sign,
# so the '$' prefix and the 'M' suffix are both left out of the match.
# The full number is captured regardless of its width: a fixed-width
# substring(x, 2, 6) would cut "$100.55M" down to 100.5.
# Votes bars without a gross figure yield NA, which keeps the result aligned
# one-to-one with votes_bar_data.
gross_data <- votes_bar_data %>%
  str_extract("(?<=\\$)[0-9.]+") %>%
  as.numeric()
# Check the length: one entry per votes bar
length(gross_data)
## [1] 100
# Data preprocessing: make sure the gross data is numeric
gross_data <- gross_data %>% as.numeric()
# Check the length of the gross data once more
gross_data %>% length()
## [1] 100
# Summary statistics of the gross earnings, including the NA count
summary(gross_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.01 24.23 56.25 99.25 125.80 532.10 9
# Assemble the scraped vectors into one data frame, one column per feature
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Inspect the structure of the resulting data frame
movies_df %>% str()
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
## $ Description : chr "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
## $ Runtime : num 116 139 85 123 151 106 108 111 128 107 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
## $ Rating : num 7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
## $ Metascore : num 81 71 NA 40 44 51 65 26 94 81 ...
## $ Votes : num 722916 553431 43200 701352 714484 ...
## $ Gross_Earning_in_Mil: num 100.5 67.2 NA 325.1 330.3 ...
## $ Director : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
## $ Actor : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...
library("ggplot2")
# Histogram of movie runtimes (30 bins), with bars filled by genre.
# Written with ggplot() + geom_histogram() because qplot() is deprecated
# as of ggplot2 3.4.0.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Scatter plot of gross earnings against runtime;
# point size encodes the rating and point colour encodes the genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre))
## Warning: Removed 9 rows containing missing values (`geom_point()`).

Question 1: Based on the above data, which movie from which Genre
had the longest runtime?
# Question 1: Based on the above data, which movie from which Genre had the longest runtime?
# Keep the row(s) whose runtime equals the maximum runtime
filter(movies_df, Runtime == max(Runtime))
## Rank Title
## 1 19 Batman v Superman: Dawn of Justice (Ultimate Edition)
## Description
## 1 Batman is manipulated by Lex Luthor to fear Superman. Superman´s existence is meanwhile dividing the world and he is framed for murder during an international crisis. The heroes clash and force the neutral Wonder Woman to reemerge.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## 1 182 Action 7.2 NA 69763 NA Zack Snyder
## Actor
## 1 Amy Adams
#Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?
The movie with the longest runtime is Batman v Superman: Dawn of Justice
(Ultimate Edition), from the Action genre, with a runtime of 182 mins.
# Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?
# Keep movies with a runtime strictly between 130 and 160 mins,
# then list them from most to fewest votes
movies_df %>%
  filter(Runtime > 130, Runtime < 160) %>%
  arrange(desc(Votes))
## Rank Title
## 1 24 Captain America: Civil War
## 2 5 Batman v Superman: Dawn of Justice
## 3 16 Rogue One: A Star Wars Story
## 4 2 Hacksaw Ridge
## 5 37 Fantastic Beasts and Where to Find Them
## 6 34 X-Men: Apocalypse
## 7 39 Manchester by the Sea
## 8 55 The Conjuring 2
## 9 29 The Magnificent Seven
## 10 14 The Handmaiden
## 11 89 Snowden
## 12 36 13 Hours
## 13 97 Patriots Day
## 14 67 A Cure for Wellness
## 15 90 The Lost City of Z
## 16 80 The Wailing
## 17 59 Race
## Description
## 1 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
## 2 Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.
## 3 In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction.
## 4 World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot.
## 5 The adventures of writer Newt Scamander in New York's secret community of witches and wizards seventy years before Harry Potter reads his book in school.
## 6 In the 1980s the X-Men must defeat an ancient all-powerful mutant, En Sabah Nur, who intends to thrive through bringing destruction to the world.
## 7 A depressed uncle is asked to take care of his teenage nephew after the boy's father dies.
## 8 Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit.
## 9 Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist.
## 10 A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her.
## 11 The NSA's illegal surveillance techniques are leaked to the public by one of the agency's employees, Edward Snowden, in the form of thousands of classified documents distributed to the press.
## 12 During an attack on a U.S. compound in Libya, a security team struggles to make sense out of the chaos.
## 13 The story of the 2013 Boston Marathon bombing and the aftermath, which includes the city-wide manhunt to find the terrorists responsible.
## 14 An ambitious young executive is sent to retrieve his company's CEO from an idyllic but mysterious "wellness center" at a remote location in the Swiss Alps, but soon suspects that the spa's treatments are not what they seem.
## 15 A true-life drama, centering on British explorer Major Percival Fawcett, who disappeared whilst searching for a mysterious city in the Amazon in the 1920s.
## 16 Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 17 Jesse Owens' quest to become the greatest track and field athlete in history thrusts him onto the world stage of the 1936 Olympics, where he faces off against Adolf Hitler's vision of Aryan supremacy.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## 1 147 Action 7.8 75 815574 408.00
## 2 151 Action 6.4 44 714484 330.30
## 3 133 Action 7.8 65 660049 532.10
## 4 139 Biography 8.1 71 553431 67.21
## 5 132 Adventure 7.2 66 489886 234.00
## 6 144 Action 6.9 52 447284 155.40
## 7 137 Drama 7.8 96 295022 47.70
## 8 134 Horror 7.3 65 283227 102.40
## 9 132 Action 6.9 54 220031 93.43
## 10 145 Drama 8.1 85 158295 2.01
## 11 134 Biography 7.3 58 157354 21.59
## 12 144 Action 7.3 48 151297 52.85
## 13 133 Action 7.3 69 108170 31.89
## 14 146 Drama 6.4 47 104534 8.11
## 15 141 Adventure 6.6 78 94916 8.58
## 16 156 Drama 7.4 81 74254 NA
## 17 134 Biography 7.1 56 39274 19.10
## Director Actor
## 1 Anthony Russo Chris Evans
## 2 Zack Snyder Ben Affleck
## 3 Gareth Edwards Felicity Jones
## 4 Mel Gibson Andrew Garfield
## 5 David Yates Eddie Redmayne
## 6 Bryan Singer James McAvoy
## 7 Kenneth Lonergan Casey Affleck
## 8 James Wan Vera Farmiga
## 9 Antoine Fuqua Denzel Washington
## 10 Park Chan-wook Kim Min-hee
## 11 Oliver Stone Joseph Gordon-Levitt
## 12 Michael Bay John Krasinski
## 13 Peter Berg Mark Wahlberg
## 14 Gore Verbinski Dane DeHaan
## 15 James Gray Charlie Hunnam
## 16 Na Hong-jin Jun Kunimura
## 17 Stephen Hopkins Stephan James
The genre with the highest votes between 130 mins and 160 mins is Action, and
the most votes go to Captain America: Civil War (815,574 votes).
# Question 3: Based on the above data, across all genres which genre has the highest average gross earnings in runtime 100 to 120.
# Keep movies with a runtime strictly between 100 and 120 mins, average the
# gross earnings per genre (ignoring NAs), and sort from highest to lowest
movies_df %>%
  filter(Runtime > 100, Runtime < 120) %>%
  group_by(Genre) %>%
  summarise(
    highest_gross_earnings = mean(Gross_Earning_in_Mil, na.rm = TRUE)
  ) %>%
  arrange(desc(highest_gross_earnings))
## # A tibble: 8 × 2
## Genre highest_gross_earnings
## <fct> <dbl>
## 1 Animation 216.
## 2 Adventure 210.
## 3 Action 88.3
## 4 Horror 69.8
## 5 Drama 55.2
## 6 Crime 51.1
## 7 Biography 35.9
## 8 Comedy 24.8
The genre with the highest average gross earnings for runtimes between 100 and
120 mins is Animation, at about $216 million.