library(rvest)
library(tidyverse)
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'
#Reading the HTML code from the website
webpage <- read_html(url)
#Using CSS selectors to scrape the rankings section
rank_data_html <- html_nodes(webpage, '.text-primary')
#Converting the ranking data to text
rank_data <- html_text(rank_data_html)
#Data-Preprocessing: Converting rankings to numerical
rank_data<-as.numeric(rank_data)
head(rank_data)
## [1] 1 2 3 4 5 6
#Using CSS selectors to scrape the title section
title_data_html <- html_nodes(webpage,'.lister-item-header a')
#Converting the title data to text
title_data <- html_text(title_data_html)
#Let's have a look at the title
head(title_data)
## [1] "Arrival" "Hacksaw Ridge"
## [3] "Terrifier" "Suicide Squad"
## [5] "Batman v Superman: Dawn of Justice" "Me Before You"
#Using CSS selectors to scrape the description section
description_data_html <- html_nodes(webpage,'.ratings-bar+ .text-muted')
#Converting the description data to text
description_data <- html_text(description_data_html)
#Data-Preprocessing: removing '\n'
description_data<-gsub("\n","",description_data)
#Let's have another look at the description data
head(description_data)
## [1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
## [2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
## [6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
runtime_data_html <- html_nodes(webpage,'.text-muted .runtime')
#Converting the runtime data to text
runtime_data <- html_text(runtime_data_html)
#Data-Preprocessing: removing mins and converting it to numerical
runtime_data<-gsub(" min","",runtime_data)
runtime_data<-as.numeric(runtime_data)
#Let's have another look at the runtime data
head(runtime_data)
## [1] 116 139 85 123 151 106
#Using CSS selectors to scrape the Movie genre section
genre_data_html <- html_nodes(webpage,'.genre')
#Converting the genre data to text
genre_data <- html_text(genre_data_html)
#Data-Preprocessing: removing \n
genre_data<-gsub("\n","",genre_data)
#Data-Preprocessing: removing excess spaces
genre_data<-gsub(" ","",genre_data)
#taking only the first genre of each movie
genre_data<-gsub(",.*","",genre_data)
#Convering each genre from text to factor
genre_data<-as.factor(genre_data)
#Let's have another look at the genre data
head(genre_data)
## [1] Drama Biography Horror Action Action Drama
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
#Using CSS selectors to scrape the IMDB rating section
rating_data_html <- html_nodes(webpage,'.ratings-imdb-rating strong')
#Converting the ratings data to text
rating_data <- html_text(rating_data_html)
#Data-Preprocessing: converting ratings to numerical
rating_data <- as.numeric(rating_data)
head(rating_data)
## [1] 7.9 8.1 5.6 5.9 6.4 7.4
votes_data_html <- html_nodes(webpage,'.sort-num_votes-visible span:nth-child(2)')
#Converting the votes data to text
votes_data <- html_text(votes_data_html)
#Data-Preprocessing: removing commas
votes_data<-gsub(",","",votes_data)
#Data-Preprocessing: converting votes to numerical
votes_data<-as.numeric(votes_data)
#Let's have another look at the votes data
head(votes_data)
## [1] 722898 553413 43195 701344 714479 268249
length(votes_data)
## [1] 100
votes_bar_data <- html_nodes(webpage,'.sort-num_votes-visible') %>%
html_text2()
head(votes_bar_data)
## [1] "Votes: 722,898 | Gross: $100.55M" "Votes: 553,413 | Gross: $67.21M"
## [3] "Votes: 43,195" "Votes: 701,344 | Gross: $325.10M"
## [5] "Votes: 714,479 | Gross: $330.36M" "Votes: 268,249 | Gross: $56.25M"
gross_data <- str_match(votes_bar_data, "\\$.+$") # extract the gross earnings
gross_data <- gsub("M","",gross_data) # clean data: remove 'M' sign
gross_data <- substring(gross_data,2,6) %>% # clean data: remove '$' sign
as.numeric()
length(gross_data)
## [1] 100
directors_data_html <- html_nodes(webpage,'.text-muted+ p a:nth-child(1)')
directors_data <- html_text(directors_data_html)
#Let's have a look at the directors data
head(directors_data)
## [1] "Denis Villeneuve" "Mel Gibson" "Damien Leone" "David Ayer"
## [5] "Zack Snyder" "Thea Sharrock"
#Data-Preprocessing: converting directors data into factors
directors_data<-as.factor(directors_data)
actors_data_html <- html_nodes(webpage,'.lister-item-content .ghost+ a')
#Converting the gross actors data to text
actors_data <- html_text(actors_data_html)
#Data-Preprocessing: converting actors data into factors
actors_data<-as.factor(actors_data)
head(actors_data)
## [1] Amy Adams Andrew Garfield Jenna Kanell Will Smith
## [5] Ben Affleck Emilia Clarke
## 90 Levels: Aamir Khan Adam Driver Alexander Skarsgård ... Zoey Deutch
ratings_bar_data <- html_nodes(webpage,'.ratings-bar') %>%
# scrape the ratings bar and convert to text
html_text2()
head(ratings_bar_data) # look at the ratings bar
## [1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
## [2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
## [3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
## [6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"
metascore_data <- str_match(ratings_bar_data, "\\d{2} Metascore") %>%
# extract Metascore
str_match("\\d{2}") %>%
as.numeric() # convert to number
length(metascore_data)
## [1] 100
metascore_data
## [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
## [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
## [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
## [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 47.00 61.00 60.05 72.50 99.00 5
#Combining all the lists to form a data frame
movies_df<-data.frame(Rank = rank_data, Title = title_data, Description = description_data, Runtime = runtime_data, Genre = genre_data, Rating = rating_data, Metascore = metascore_data, Votes = votes_data, Gross_Earning_in_Mil = gross_data, Director = directors_data, Actor = actors_data)
#Structure of the data frame
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
## $ Description : chr "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
## $ Runtime : num 116 139 85 123 151 106 108 111 128 107 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
## $ Rating : num 7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
## $ Metascore : num 81 71 NA 40 44 51 65 26 94 81 ...
## $ Votes : num 722898 553413 43195 701344 714479 ...
## $ Gross_Earning_in_Mil: num 100.5 67.2 NA 325.1 330.3 ...
## $ Director : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
## $ Actor : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...
Based on the above data, which movie from which Genre had the longest runtime?
Here I created a new variable called “q1” that stands for question 1. In this code I called my newly created dataframe and grouped by genre. After I found the movie with the longest runtime. Which is “Batman v Superman: Dawn of Justice (Ultimate Edition)” from the action genre and has a runtime of 182.
q1 <- movies_df %>%
group_by(Genre) %>%
filter(Runtime == max(Runtime)) %>%
slice_max(order_by = Runtime, n = 1)
q1
## # A tibble: 8 × 11
## # Groups: Genre [8]
## Rank Title Description Runtime Genre Rating Metascore Votes
## <dbl> <chr> <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19 Batman v Superman: Da… Batman is … 182 Acti… 7.2 NA 69751
## 2 85 American Honey A teenage … 163 Adve… 7 80 44924
## 3 86 A Silent Voice: The M… A young ma… 130 Anim… 8.1 78 90789
## 4 2 Hacksaw Ridge World War … 139 Biog… 8.1 71 553413
## 5 9 La La Land While navi… 128 Come… 8 94 618962
## 6 18 The Girl on the Train A divorcee… 112 Crime 6.5 48 193160
## 7 78 Silence In the 17t… 161 Drama 7.1 79 117156
## 8 55 The Conjuring 2 Ed and Lor… 134 Horr… 7.3 65 283220
## # ℹ 3 more variables: Gross_Earning_in_Mil <dbl>, Director <fct>, Actor <fct>
#view(q1)
#ggplot(q1, aes(x = Genre, y = Runtime)) +
#geom_bar()
p<-ggplot(data=q1, aes(x=Title, y=Runtime, fill = Genre)) +
geom_bar(stat="identity") +
coord_flip()
p
q1x <- movies_df %>%
arrange(desc(Runtime)) %>%
head(10)
q1x
## Rank Title
## 1 19 Batman v Superman: Dawn of Justice (Ultimate Edition)
## 2 85 American Honey
## 3 78 Silence
## 4 100 Dangal
## 5 80 The Wailing
## 6 5 Batman v Superman: Dawn of Justice
## 7 24 Captain America: Civil War
## 8 67 A Cure for Wellness
## 9 14 The Handmaiden
## 10 34 X-Men: Apocalypse
## Description
## 1 Batman is manipulated by Lex Luthor to fear Superman. Superman´s existence is meanwhile dividing the world and he is framed for murder during an international crisis. The heroes clash and force the neutral Wonder Woman to reemerge.
## 2 A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits.
## 3 In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism.
## 4 Former wrestler Mahavir Singh Phogat and his two wrestler daughters struggle towards glory at the Commonwealth Games in the face of societal oppression.
## 5 Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 6 Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.
## 7 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
## 8 An ambitious young executive is sent to retrieve his company's CEO from an idyllic but mysterious "wellness center" at a remote location in the Swiss Alps, but soon suspects that the spa's treatments are not what they seem.
## 9 A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her.
## 10 In the 1980s the X-Men must defeat an ancient all-powerful mutant, En Sabah Nur, who intends to thrive through bringing destruction to the world.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## 1 182 Action 7.2 NA 69751 NA
## 2 163 Adventure 7.0 80 44924 0.66
## 3 161 Drama 7.1 79 117156 7.10
## 4 161 Action 8.3 NA 198306 12.39
## 5 156 Drama 7.4 81 74250 NA
## 6 151 Action 6.4 44 714479 330.30
## 7 147 Action 7.8 75 815561 408.00
## 8 146 Drama 6.4 47 104530 8.11
## 9 145 Drama 8.1 85 158288 2.01
## 10 144 Action 6.9 52 447279 155.40
## Director Actor
## 1 Zack Snyder Amy Adams
## 2 Andrea Arnold Sasha Lane
## 3 Martin Scorsese Andrew Garfield
## 4 Nitesh Tiwari Aamir Khan
## 5 Na Hong-jin Jun Kunimura
## 6 Zack Snyder Ben Affleck
## 7 Anthony Russo Chris Evans
## 8 Gore Verbinski Dane DeHaan
## 9 Park Chan-wook Kim Min-hee
## 10 Bryan Singer James McAvoy
Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?
Answer: Action Genre. Additionally the top movie is Rogue One: A Star Wars Story. I created a variable that filtered for the runtime and arranged by votes.
q2 <- movies_df %>%
filter(Runtime >= 130, Runtime <=160) %>%
arrange(desc(Votes)) %>%
top_n(10)
## Selecting by Actor
q2
## Rank Title
## 1 16 Rogue One: A Star Wars Story
## 2 34 X-Men: Apocalypse
## 3 55 The Conjuring 2
## 4 14 The Handmaiden
## 5 89 Snowden
## 6 36 13 Hours
## 7 97 Patriots Day
## 8 86 A Silent Voice: The Movie
## 9 80 The Wailing
## 10 59 Race
## Description
## 1 In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction.
## 2 In the 1980s the X-Men must defeat an ancient all-powerful mutant, En Sabah Nur, who intends to thrive through bringing destruction to the world.
## 3 Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit.
## 4 A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her.
## 5 The NSA's illegal surveillance techniques are leaked to the public by one of the agency's employees, Edward Snowden, in the form of thousands of classified documents distributed to the press.
## 6 During an attack on a U.S. compound in Libya, a security team struggles to make sense out of the chaos.
## 7 The story of the 2013 Boston Marathon bombing and the aftermath, which includes the city-wide manhunt to find the terrorists responsible.
## 8 A young man is ostracized by his classmates after he bullies a deaf girl to the point where she moves away. Years later, he sets off on a path for redemption.
## 9 Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 10 Jesse Owens' quest to become the greatest track and field athlete in history thrusts him onto the world stage of the 1936 Olympics, where he faces off against Adolf Hitler's vision of Aryan supremacy.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## 1 133 Action 7.8 65 660038 532.10
## 2 144 Action 6.9 52 447279 155.40
## 3 134 Horror 7.3 65 283220 102.40
## 4 145 Drama 8.1 85 158288 2.01
## 5 134 Biography 7.3 58 157352 21.59
## 6 144 Action 7.3 48 151294 52.85
## 7 133 Action 7.3 69 108170 31.89
## 8 130 Animation 8.1 78 90789 NA
## 9 156 Drama 7.4 81 74250 NA
## 10 134 Biography 7.1 56 39273 19.10
## Director Actor
## 1 Gareth Edwards Felicity Jones
## 2 Bryan Singer James McAvoy
## 3 James Wan Vera Farmiga
## 4 Park Chan-wook Kim Min-hee
## 5 Oliver Stone Joseph Gordon-Levitt
## 6 Michael Bay John Krasinski
## 7 Peter Berg Mark Wahlberg
## 8 Naoko Yamada Miyu Irino
## 9 Na Hong-jin Jun Kunimura
## 10 Stephen Hopkins Stephan James
This plot clearly shows that the action genre has the highest votes and shows which movies make up that sum.
options(scipen = 999)
p2<-ggplot(data=q2, aes(x=Genre, y=Votes, fill=Title)) +
geom_bar(stat="identity") +
coord_flip()
p2
Based on the above data, across all genres which genre has the highest average gross earnings in runtime 100 to 120.
For this answer I filtered out for the specified runtime. Next I calculated the average gross earnings based on genre.
q3 <- movies_df %>%
filter(Runtime >= 100, Runtime <= 120) %>%
group_by(Genre) %>%
summarise(gross_earnings = mean(Gross_Earning_in_Mil)) %>%
arrange(desc(gross_earnings))
head(q3)
## # A tibble: 6 × 2
## Genre gross_earnings
## <fct> <dbl>
## 1 Animation 216.
## 2 Adventure 210.
## 3 Action 89.2
## 4 Horror 69.8
## 5 Drama 55.2
## 6 Comedy 37.4
p4 <- ggplot(data = q3, aes(x = Genre, y = gross_earnings)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip()
p4
head(q3)
## # A tibble: 6 × 2
## Genre gross_earnings
## <fct> <dbl>
## 1 Animation 216.
## 2 Adventure 210.
## 3 Action 89.2
## 4 Horror 69.8
## 5 Drama 55.2
## 6 Comedy 37.4