Web Scraping Homework

Load libraries

library(rvest)
library(tidyverse)

Set webpage

# IMDb title search: up to 100 feature films with a 2016 release date
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'
# Fetch and parse the page once; the scraping steps below all read `webpage`
webpage <- url %>% read_html()

Scraping for rank

# Select the nodes carrying the "text-primary" class (the rank labels)
rank_data_html <- webpage %>% html_nodes('.text-primary')
# Read the node text and coerce it straight to numeric
rank_data <- rank_data_html %>%
  html_text() %>%
  as.numeric()
# Preview the first few ranks
head(rank_data)
## [1] 1 2 3 4 5 6

Scraping the title section

# Select the links inside each list item's header (the movie titles)
title_data_html <- webpage %>% html_nodes('.lister-item-header a')
# Keep only the link text
title_data <- title_data_html %>% html_text()
# Preview the first few titles
head(title_data)
## [1] "Arrival"                            "Hacksaw Ridge"                     
## [3] "Terrifier"                          "Suicide Squad"                     
## [5] "Batman v Superman: Dawn of Justice" "Me Before You"

Description section

# Select the muted-text element that immediately follows each ratings bar
description_data_html <- webpage %>% html_nodes('.ratings-bar+ .text-muted')
# Read the text and drop every newline character in one step
description_data <- gsub("\n", "", html_text(description_data_html))
# Preview the cleaned descriptions
head(description_data)
## [1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
## [2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
## [5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
## [6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."

Movie runtime section

# Select the runtime elements nested inside the muted-text line
runtime_data_html <- webpage %>% html_nodes('.text-muted .runtime')
# Read the text, strip the " min" suffix, and coerce to numeric
runtime_data <- runtime_data_html %>%
  html_text() %>%
  gsub(" min", "", .) %>%
  as.numeric()
# Preview the runtimes
head(runtime_data)
## [1] 116 139  85 123 151 106

Movie genre section

# Select the genre elements
genre_data_html <- webpage %>% html_nodes('.genre')
# Clean the text in sequence: drop newlines, drop all spaces, then keep only
# what precedes the first comma (i.e. the first listed genre), and store the
# result as a factor
genre_data <- genre_data_html %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()
# Preview the genres and their factor levels
head(genre_data)
## [1] Drama     Biography Horror    Action    Action    Drama    
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

IMDB rating section

# Select the <strong> element inside each IMDb rating widget
rating_data_html <- webpage %>% html_nodes('.ratings-imdb-rating strong')
# Read the text and coerce it to numeric
rating_data <- as.numeric(html_text(rating_data_html))
# Preview the ratings
head(rating_data)
## [1] 7.9 8.1 5.6 5.9 6.4 7.4

Votes section

# Select the <span> that is the second child of each votes bar
votes_data_html <- webpage %>% html_nodes('.sort-num_votes-visible span:nth-child(2)')
# Read the text, strip the comma separators, and coerce to numeric
votes_data <- votes_data_html %>%
  html_text() %>%
  gsub(",", "", .) %>%
  as.numeric()
# Preview the vote counts
head(votes_data)
## [1] 722898 553413  43195 701344 714479 268249
# Number of vote counts scraped
length(votes_data)
## [1] 100

Gross Earnings

# Scrape the whole votes bar as text; the gross figure, when present, is part
# of this string, so movies without one still yield an entry
votes_bar_data <- html_text2(html_nodes(webpage, '.sort-num_votes-visible'))
head(votes_bar_data)
## [1] "Votes: 722,898 | Gross: $100.55M" "Votes: 553,413 | Gross: $67.21M" 
## [3] "Votes: 43,195"                    "Votes: 701,344 | Gross: $325.10M"
## [5] "Votes: 714,479 | Gross: $330.36M" "Votes: 268,249 | Gross: $56.25M"
# Gross earnings in millions of dollars, one entry per votes bar.
# The capture group takes every digit and decimal point between "$" and the
# trailing "M", so the full value is kept regardless of its width (a fixed
# 5-character substring would cut "330.36" down to "330.3").
# Bars without a "$...M" figure do not match and become NA.
# NOTE(review): assumes every gross figure is expressed in millions ("M");
# any other suffix would yield NA - confirm against the page.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)M")[, 2] %>%
  as.numeric()
# Number of entries; should equal the number of votes bars scraped
length(gross_data)
## [1] 100

Directors section

# Select the first link in the paragraph that follows the muted-text line
# (presumably the director credit - verify against the page markup)
directors_data_html <- webpage %>% html_nodes('.text-muted+ p a:nth-child(1)')
directors_data <- directors_data_html %>% html_text()
# Preview the names while they are still plain character strings
head(directors_data)
## [1] "Denis Villeneuve" "Mel Gibson"       "Damien Leone"     "David Ayer"      
## [5] "Zack Snyder"      "Thea Sharrock"
# Store the director names as a factor
directors_data <- directors_data %>% as.factor()

Actors section

# Select the link that immediately follows the ".ghost" separator in each item
# (presumably the first-billed actor - verify against the page markup)
actors_data_html <- webpage %>% html_nodes('.lister-item-content .ghost+ a')
# Read the link text and store the names as a factor
actors_data <- as.factor(html_text(actors_data_html))
# Preview the actors and the number of distinct levels
head(actors_data)
## [1] Amy Adams       Andrew Garfield Jenna Kanell    Will Smith     
## [5] Ben Affleck     Emilia Clarke  
## 90 Levels: Aamir Khan Adam Driver Alexander Skarsgård ... Zoey Deutch

Metascore

# Scrape each ratings bar as a single text string; the Metascore, when present,
# appears at the end of that string
ratings_bar_data <- html_text2(html_nodes(webpage, '.ratings-bar'))
# Preview the raw ratings-bar text
head(ratings_bar_data)
## [1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
## [2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
## [3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
## [6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"
# Metascore per ratings bar, NA where the bar has no "Metascore" label.
# The capture group accepts one to three digits so that a single-digit score
# or a perfect 100 is read correctly (a fixed two-digit pattern would turn
# "100 Metascore" into "00" and miss single-digit scores entirely).
metascore_data <- str_match(ratings_bar_data, "(\\d{1,3}) Metascore")[, 2] %>%
  as.numeric() # convert the captured digits to a number
# One entry per ratings bar, including the NA placeholders
length(metascore_data)
## [1] 100
metascore_data
##   [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
##  [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
##  [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
##  [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA
# Distribution of the scores, with the count of missing values
summary(metascore_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   61.00   60.05   72.50   99.00       5

Dataframe

# Assemble the scraped vectors into one data frame, one column per vector.
# data.frame() requires the vectors to have compatible lengths.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types and a preview of each column
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
##  $ Description         : chr  "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
##  $ Runtime             : num  116 139 85 123 151 106 108 111 128 107 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
##  $ Rating              : num  7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
##  $ Metascore           : num  81 71 NA 40 44 51 65 26 94 81 ...
##  $ Votes               : num  722898 553413 43195 701344 714479 ...
##  $ Gross_Earning_in_Mil: num  100.5 67.2 NA 325.1 330.3 ...
##  $ Director            : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
##  $ Actor               : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...

Filter for HW Question 1

Based on the above data, which movie from which Genre had the longest runtime?

Here I created a new variable called “q1”, which stands for question 1. In this code I took my newly created data frame, grouped it by genre, and then found the movie with the longest runtime in each genre. The longest overall is “Batman v Superman: Dawn of Justice (Ultimate Edition)” from the Action genre, with a runtime of 182 minutes.

# Longest-running movie within each genre.
# slice_max() alone selects the per-group maximum (ties are kept by default),
# so no separate `Runtime == max(Runtime)` filter is needed; that filter would
# also drop every row of a genre containing a missing runtime, because max()
# returns NA there.
# The result remains grouped by Genre.
q1 <- movies_df %>%
  group_by(Genre) %>%
  slice_max(order_by = Runtime, n = 1)
q1
## # A tibble: 8 × 11
## # Groups:   Genre [8]
##    Rank Title                  Description Runtime Genre Rating Metascore  Votes
##   <dbl> <chr>                  <chr>         <dbl> <fct>  <dbl>     <dbl>  <dbl>
## 1    19 Batman v Superman: Da… Batman is …     182 Acti…    7.2        NA  69751
## 2    85 American Honey         A teenage …     163 Adve…    7          80  44924
## 3    86 A Silent Voice: The M… A young ma…     130 Anim…    8.1        78  90789
## 4     2 Hacksaw Ridge          World War …     139 Biog…    8.1        71 553413
## 5     9 La La Land             While navi…     128 Come…    8          94 618962
## 6    18 The Girl on the Train  A divorcee…     112 Crime    6.5        48 193160
## 7    78 Silence                In the 17t…     161 Drama    7.1        79 117156
## 8    55 The Conjuring 2        Ed and Lor…     134 Horr…    7.3        65 283220
## # ℹ 3 more variables: Gross_Earning_in_Mil <dbl>, Director <fct>, Actor <fct>
  #view(q1)
# Horizontal bar chart of the longest movie per genre: one bar per title,
# bar length = runtime, bar colour = genre
p <- ggplot(q1, aes(x = Title, y = Runtime, fill = Genre)) +
  geom_bar(stat = "identity") +
  coord_flip()
p

A second way of filtering for answer

# Cross-check: sort every movie by runtime, longest first, and keep the first
# ten rows
q1x <- head(arrange(movies_df, desc(Runtime)), 10)
q1x
##    Rank                                                 Title
## 1    19 Batman v Superman: Dawn of Justice (Ultimate Edition)
## 2    85                                        American Honey
## 3    78                                               Silence
## 4   100                                                Dangal
## 5    80                                           The Wailing
## 6     5                    Batman v Superman: Dawn of Justice
## 7    24                            Captain America: Civil War
## 8    67                                   A Cure for Wellness
## 9    14                                        The Handmaiden
## 10   34                                     X-Men: Apocalypse
##                                                                                                                                                                                                                                Description
## 1  Batman is manipulated by Lex Luthor to fear Superman. Superman´s existence is meanwhile dividing the world and he is framed for murder during an international crisis. The heroes clash and force the neutral Wonder Woman to reemerge.
## 2                       A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits.
## 3                                                        In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism.
## 4                                                                                 Former wrestler Mahavir Singh Phogat and his two wrestler daughters struggle towards glory at the Commonwealth Games in the face of societal oppression.
## 5                                           Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 6                                                                            Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.
## 7                                                                                                                                       Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
## 8          An ambitious young executive is sent to retrieve his company's CEO from an idyllic but mysterious "wellness center" at a remote location in the Swiss Alps, but soon suspects that the spa's treatments are not what they seem.
## 9                                                                                                                           A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her.
## 10                                                                                       In the 1980s the X-Men must defeat an ancient all-powerful mutant, En Sabah Nur, who intends to thrive through bringing destruction to the world.
##    Runtime     Genre Rating Metascore  Votes Gross_Earning_in_Mil
## 1      182    Action    7.2        NA  69751                   NA
## 2      163 Adventure    7.0        80  44924                 0.66
## 3      161     Drama    7.1        79 117156                 7.10
## 4      161    Action    8.3        NA 198306                12.39
## 5      156     Drama    7.4        81  74250                   NA
## 6      151    Action    6.4        44 714479               330.30
## 7      147    Action    7.8        75 815561               408.00
## 8      146     Drama    6.4        47 104530                 8.11
## 9      145     Drama    8.1        85 158288                 2.01
## 10     144    Action    6.9        52 447279               155.40
##           Director           Actor
## 1      Zack Snyder       Amy Adams
## 2    Andrea Arnold      Sasha Lane
## 3  Martin Scorsese Andrew Garfield
## 4    Nitesh Tiwari      Aamir Khan
## 5      Na Hong-jin    Jun Kunimura
## 6      Zack Snyder     Ben Affleck
## 7    Anthony Russo     Chris Evans
## 8   Gore Verbinski     Dane DeHaan
## 9   Park Chan-wook     Kim Min-hee
## 10    Bryan Singer    James McAvoy

Homework question 2

Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

Answer: Action Genre. Additionally the top movie is Rogue One: A Star Wars Story. I created a variable that filtered for the runtime and arranged by votes.

# Ten most-voted movies with a runtime between 130 and 160 minutes (inclusive).
# slice_max() ranks explicitly by Votes and returns the rows in descending
# vote order. A bare top_n(10) ranks by the LAST column of the data frame
# (Actor), which silently drops highly voted movies whose lead actor sorts
# early alphabetically.
# with_ties = FALSE caps the result at exactly ten rows.
# Re-knit the document to refresh the printed table that follows.
q2 <- movies_df %>%
  filter(Runtime >= 130, Runtime <= 160) %>%
  slice_max(Votes, n = 10, with_ties = FALSE)
q2
##    Rank                        Title
## 1    16 Rogue One: A Star Wars Story
## 2    34            X-Men: Apocalypse
## 3    55              The Conjuring 2
## 4    14               The Handmaiden
## 5    89                      Snowden
## 6    36                     13 Hours
## 7    97                 Patriots Day
## 8    86    A Silent Voice: The Movie
## 9    80                  The Wailing
## 10   59                         Race
##                                                                                                                                                                                                 Description
## 1                                           In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction.
## 2                                                         In the 1980s the X-Men must defeat an ancient all-powerful mutant, En Sabah Nur, who intends to thrive through bringing destruction to the world.
## 3                                                            Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit.
## 4                                                                                            A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her.
## 5           The NSA's illegal surveillance techniques are leaked to the public by one of the agency's employees, Edward Snowden, in the form of thousands of classified documents distributed to the press.
## 6                                                                                                   During an attack on a U.S. compound in Libya, a security team struggles to make sense out of the chaos.
## 7                                                                 The story of the 2013 Boston Marathon bombing and the aftermath, which includes the city-wide manhunt to find the terrorists responsible.
## 8                                            A young man is ostracized by his classmates after he bullies a deaf girl to the point where she moves away. Years later, he sets off on a path for redemption.
## 9            Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 10 Jesse Owens' quest to become the greatest track and field athlete in history thrusts him onto the world stage of the 1936 Olympics, where he faces off against Adolf Hitler's vision of Aryan supremacy.
##    Runtime     Genre Rating Metascore  Votes Gross_Earning_in_Mil
## 1      133    Action    7.8        65 660038               532.10
## 2      144    Action    6.9        52 447279               155.40
## 3      134    Horror    7.3        65 283220               102.40
## 4      145     Drama    8.1        85 158288                 2.01
## 5      134 Biography    7.3        58 157352                21.59
## 6      144    Action    7.3        48 151294                52.85
## 7      133    Action    7.3        69 108170                31.89
## 8      130 Animation    8.1        78  90789                   NA
## 9      156     Drama    7.4        81  74250                   NA
## 10     134 Biography    7.1        56  39273                19.10
##           Director                Actor
## 1   Gareth Edwards       Felicity Jones
## 2     Bryan Singer         James McAvoy
## 3        James Wan         Vera Farmiga
## 4   Park Chan-wook          Kim Min-hee
## 5     Oliver Stone Joseph Gordon-Levitt
## 6      Michael Bay       John Krasinski
## 7       Peter Berg        Mark Wahlberg
## 8     Naoko Yamada           Miyu Irino
## 9      Na Hong-jin         Jun Kunimura
## 10 Stephen Hopkins        Stephan James

Plot 2

This plot clearly shows that the action genre has the highest votes and shows which movies make up that sum.

# Penalise scientific notation heavily so large numbers print in fixed notation
options(scipen = 999)
# Horizontal stacked bars: total votes per genre, each segment one movie title
p2 <- ggplot(q2, aes(x = Genre, y = Votes, fill = Title)) +
  geom_bar(stat = "identity") +
  coord_flip()
p2

Homework Question 3

Based on the above data, across all genres, which genre has the highest average gross earnings for runtimes of 100 to 120 minutes?

For this answer I filtered for the specified runtime range. Next I calculated the average gross earnings for each genre.

# Average gross earnings (in millions) per genre for movies running 100-120
# minutes (inclusive), highest average first.
# na.rm = TRUE averages only the movies that report a gross figure; without it
# a single missing value would turn the whole genre's average into NA.
# A genre with no reported gross at all yields NaN and sorts last.
q3 <- movies_df %>%
  filter(Runtime >= 100, Runtime <= 120) %>%
  group_by(Genre) %>%
  summarise(gross_earnings = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  arrange(desc(gross_earnings))

# Top genres by average gross
head(q3)
## # A tibble: 6 × 2
##   Genre     gross_earnings
##   <fct>              <dbl>
## 1 Animation          216. 
## 2 Adventure          210. 
## 3 Action              89.2
## 4 Horror              69.8
## 5 Drama               55.2
## 6 Comedy              37.4
# Horizontal bar chart of the average gross earnings for each genre in q3
p4 <- q3 %>%
  ggplot(aes(x = Genre, y = gross_earnings)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip()

p4

Additional code to prove my answer

# Show the first rows of the per-genre averages again as supporting evidence
q3 %>% head()
## # A tibble: 6 × 2
##   Genre     gross_earnings
##   <fct>              <dbl>
## 1 Animation          216. 
## 2 Adventure          210. 
## 3 Action              89.2
## 4 Horror              69.8
## 5 Drama               55.2
## 6 Comedy              37.4

Thank you!