DATA 110 - Webscraping Assignment

Author

N. Yasmin Bromir

Intro

#Loading the rvest package
library('rvest')

# Target page: IMDb search for feature films released in 2016, 100 per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download the page and parse its HTML
webpage <- url %>% read_html()

Getting the Movie Rankings

# Select the nodes matched by the '.text-primary' CSS selector (the rankings)
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Pull the text out of each selected node
rank_data <- rank_data_html %>% html_text()

# Preview the first few scraped rankings
rank_data %>% head()

[1] "1." "2." "3." "4." "5." "6."

# Data-Preprocessing: coerce the scraped ranking strings to numbers
rank_data <- rank_data %>% as.numeric()

# Preview the converted rankings
rank_data %>% head()

[1] 1 2 3 4 5 6

Getting the Movie Titles

# Select the title links inside each '.lister-item-header'
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Pull the text out of each selected node
title_data <- title_data_html %>% html_text()

# Preview the first few titles
title_data %>% head()

[1] "Arrival"                            "Hacksaw Ridge"                     
[3] "Terrifier"                          "Suicide Squad"                     
[5] "Batman v Superman: Dawn of Justice" "Me Before You"

Scraping Descriptions

# Select the description nodes: '.text-muted' elements directly after a '.ratings-bar'
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Pull the text out of each selected node
description_data <- description_data_html %>% html_text()

# Preview the first few descriptions
description_data %>% head()

[1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
[2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
[3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
[4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
[5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
[6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."

# Data-Preprocessing: strip every newline character from the descriptions
description_data <- description_data %>% gsub("\n", "", .)

# Preview the cleaned descriptions
description_data %>% head()

[1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
[2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
[3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
[4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
[5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
[6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."

Scraping Movie Runtime

# Select the '.runtime' nodes nested inside '.text-muted' elements
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Pull the text out of each selected node
runtime_data <- runtime_data_html %>% html_text()

# Preview the first few runtimes
runtime_data %>% head()

[1] "116 min" "139 min" "85 min"  "123 min" "151 min" "106 min"

# Data-Preprocessing: drop the " min" suffix, then convert to numeric
runtime_data <- runtime_data %>%
  gsub(" min", "", .) %>%
  as.numeric()

# Preview the cleaned runtimes
runtime_data %>% head()

[1] 116 139  85 123 151 106

Scraping Movie Genre

# Select the nodes matched by the '.genre' CSS selector
genre_data_html <- webpage %>% html_nodes(".genre")

# Pull the text out of each selected node
genre_data <- genre_data_html %>% html_text()

# Preview the first few genre strings
genre_data %>% head()

[1] "\nDrama, Mystery, Sci-Fi            "    
[2] "\nBiography, Drama, History            " 
[3] "\nHorror, Thriller            "          
[4] "\nAction, Adventure, Fantasy            "
[5] "\nAction, Adventure, Sci-Fi            " 
[6] "\nDrama, Romance            "

# Data-Preprocessing, in order:
#   1. remove newline characters
#   2. remove all spaces
#   3. keep only the first genre (drop everything from the first comma on)
#   4. convert the remaining text to a factor
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()

# Preview the cleaned genres
genre_data %>% head()

[1] Drama     Biography Horror    Action    Action    Drama    
Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

Scraping IMDB Movie Rating

# Select the <strong> nodes inside '.ratings-imdb-rating' (the IMDB ratings)
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

# Pull the text out of each selected node
rating_data <- rating_data_html %>% html_text()

# Preview the first few ratings
rating_data %>% head()

[1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"

# Data-Preprocessing: coerce the rating strings to numbers
rating_data <- rating_data %>% as.numeric()

# Preview the converted ratings
rating_data %>% head()

[1] 7.9 8.1 5.6 5.9 6.4 7.4

Scraping the Votes Section

# Select the second-child <span> inside each '.sort-num_votes-visible' (the vote count)
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Pull the text out of each selected node
votes_data <- votes_data_html %>% html_text()

# Preview the first few vote counts
votes_data %>% head()

[1] "723,144" "553,644" "43,253"  "701,445" "714,622" "268,350"

# Data-Preprocessing: remove the thousands separators, then convert to numeric
votes_data <- votes_data %>%
  gsub(",", "", .) %>%
  as.numeric()

# Preview the cleaned vote counts
votes_data %>% head()

[1] 723144 553644  43253 701445 714622 268350

Scraping Director’s Data

# Select the first link in the <p> that follows a '.text-muted' element (the director)
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Pull the text out of each selected node
directors_data <- directors_data_html %>% html_text()

# Preview the first few directors
directors_data %>% head()

[1] "Denis Villeneuve" "Mel Gibson"       "Damien Leone"     "David Ayer"      
[5] "Zack Snyder"      "Thea Sharrock"

# Data-Preprocessing: store the director names as a factor
directors_data <- directors_data %>% as.factor()

Scraping Actors

# Select the link right after the '.ghost' separator in each item (the first listed actor)
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

# Pull the text out of each selected node
actors_data <- actors_data_html %>% html_text()

# Preview the first few actors
actors_data %>% head()

[1] "Amy Adams"       "Andrew Garfield" "Jenna Kanell"    "Will Smith"     
[5] "Ben Affleck"     "Emilia Clarke"

# Data-Preprocessing: store the actor names as a factor
actors_data <- actors_data %>% as.factor()

Scraping Metascore

#install.packages(“stringr”)
library(stringr)

# Scrape each movie's ratings bar and convert it to text
ratings_bar_data <- webpage %>%
  html_nodes(".ratings-bar") %>%
  html_text2()

# Preview the first few ratings-bar strings
ratings_bar_data %>% head()

[1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
[2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
[3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
[4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
[5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
[6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"

# Extract the number that precedes the word "Metascore" in each ratings bar.
# "\\d+" accepts any number of digits, so a perfect 100 and single-digit scores
# are captured; the previous "\\d{2}" pattern turned 100 into 0 and dropped
# single-digit scores. Ratings bars without a Metascore yield NA, so the result
# keeps one entry per ratings bar.
metascore_data <- ratings_bar_data %>%
  str_extract("\\d+(?= Metascore)") %>%
  as.numeric()

# Check the length of the extracted Metascore vector
length(metascore_data)

[1] 100

# Show the full Metascore vector
print(metascore_data)

  [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
 [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
 [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
 [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA

# Summary statistics for the Metascore values
metascore_data %>% summary()

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  21.00   47.00   61.00   60.05   72.50   99.00       5

Code from Online Article

#Using CSS selectors to scrape the metascore section metascore_data_html <- html_nodes(webpage,'.metascore')

#Converting the runtime data to text metascore_data <- html_text(metascore_data_html)

#Let’s have a look at the metascore head(metascore_data)

#Data-Preprocessing: removing extra space in metascore metascore_data<-gsub(" ","",metascore_data)

#Let's check the length of metascore data length(metascore_data) for (i in c(3, 19, 38, 66, 100)){ a<-metascore_data[1:(i-1)] b<-metascore_data[i:length(metascore_data)] metascore_data <- append(a, list("NA")) metascore_data <- append(metascore_data,b) }

#Data-Preprocessing: converting metascore to numerical metascore_data <- as.numeric(metascore_data)

#Let’s have another look at length of the metascore data length(metascore_data)

Scraping the Gross Variable

# Professor's Code

# Grab each movie's votes bar (the '.sort-num_votes-visible' nodes) as text
votes_bar_data <- webpage %>%
  html_nodes(".sort-num_votes-visible") %>%
  html_text2()

# Preview the first few votes-bar strings
votes_bar_data %>% head()

[1] "Votes: 723,144 | Gross: $100.55M" "Votes: 553,644 | Gross: $67.21M" 
[3] "Votes: 43,253"                    "Votes: 701,445 | Gross: $325.10M"
[5] "Votes: 714,622 | Gross: $330.36M" "Votes: 268,350 | Gross: $56.25M"

# Extract the gross earnings (in millions of dollars) from the votes bar.
# The capture group takes every digit and decimal point between "$" and "M",
# so values keep their full precision; the previous substring(x, 2, 6) call
# cut "$100.55M" down to 100.5. Entries without a "$...M" figure yield NA.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)M")[, 2] %>%
  as.numeric()

# Check the length of the extracted gross vector
length(gross_data)

[1] 100

Processing Gross Data Using the Online Article's Code

#Using CSS selectors to scrape the gross revenue section gross_data_html <- html_nodes(webpage,'.ghost~ .text-muted+ span')

#Converting the gross revenue data to text gross_data <- html_text(gross_data_html)

#Let’s have a look at the votes data head(gross_data)

#Data-Preprocessing: removing '$' and 'M' signs gross_data<-gsub("M","",gross_data)

gross_data<-substring(gross_data,2,6)

#Let’s check the length of gross data length(gross_data)

#Filling missing entries with NA for (i in c(3, 19, 38, 47, 80, 86, 92, 99)){ a <- gross_data[1:(i-1)] b <- gross_data[i:length(gross_data)] gross_data <- append(a, list("NA")) gross_data <- append(gross_data,b) }

#Data-Preprocessing: converting gross to numerical gross_data <- as.numeric(gross_data)

#Let’s have another look at the length of gross data length(gross_data)

summary(gross_data)

Putting the Data Frame Together

# Assemble the scraped vectors into one data frame, one column per variable
movies_df <- do.call(
  data.frame,
  list(
    Rank = rank_data,
    Title = title_data,
    Description = description_data,
    Runtime = runtime_data,
    Genre = genre_data,
    Rating = rating_data,
    Metascore = metascore_data,
    Votes = votes_data,
    Gross_Earning_in_Mil = gross_data,
    Director = directors_data,
    Actor = actors_data
  )
)

# Inspect the column types and first values of the data frame
movies_df %>% str()

'data.frame':   100 obs. of  11 variables:
 $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Title               : chr  "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
 $ Description         : chr  "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
 $ Runtime             : num  116 139 85 123 151 106 108 111 128 107 ...
 $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
 $ Rating              : num  7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
 $ Metascore           : num  81 71 NA 40 44 51 65 26 94 81 ...
 $ Votes               : num  723144 553644 43253 701445 714622 ...
 $ Gross_Earning_in_Mil: num  100.5 67.2 NA 325.1 330.3 ...
 $ Director            : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
 $ Actor               : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...

Questions to Answer

Question 1:

Based on the above data, which movie from which genre had the longest runtime?

Answer: Batman v Superman: Dawn of Justice (Ultimate Edition), from the Action genre

library(dplyr)

Warning: package 'dplyr' was built under R version 4.2.3


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)

Warning: package 'ggplot2' was built under R version 4.2.3

# Scatter plot of rating against runtime; point size shows votes, colour shows genre
movies_df %>%
  ggplot(aes(Runtime, Rating)) +
  geom_point(aes(size = Votes, colour = Genre))

# Question 1: pick the movie(s) with the longest runtime directly from the data
# instead of hard-coding a threshold and genre read off the plot. Ties, if any,
# are all returned; the Genre column of the result answers the "which genre" part.
movies_df %>%
  filter(Runtime == max(Runtime, na.rm = TRUE))

  Rank                                                 Title
1   19 Batman v Superman: Dawn of Justice (Ultimate Edition)
                                                                                                                                                                                                                              Description
1 Batman is manipulated by Lex Luthor to fear Superman. Superman´s existence is meanwhile dividing the world and he is framed for murder during an international crisis. The heroes clash and force the neutral Wonder Woman to reemerge.
  Runtime  Genre Rating Metascore Votes Gross_Earning_in_Mil    Director
1     182 Action    7.2        NA 69932                   NA Zack Snyder
      Actor
1 Amy Adams

Question 2:

Based on the above data, in the runtime of 130-160 mins, which genre has the highest votes?

Answer: Action (the most-voted movie in this runtime range is Captain America: Civil War)

# Scatter plot of gross earnings against runtime; point size shows rating, colour shows genre
movies_df %>%
  ggplot(aes(Runtime, Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre)) +
  labs(y = "Gross Earnings ($ Millions)")

Warning: Removed 9 rows containing missing values (`geom_point()`).

# Question 2: keep movies running 130-160 minutes (inclusive), then keep the
# row(s) whose vote count is the highest within that subset
movies_df %>%
  filter(between(Runtime, 130, 160)) %>%
  filter(Votes == max(Votes))

  Rank                      Title
1   24 Captain America: Civil War
                                                                                         Description
1 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
  Runtime  Genre Rating Metascore  Votes Gross_Earning_in_Mil      Director
1     147 Action    7.8        75 815742                  408 Anthony Russo
        Actor
1 Chris Evans

Question 3:

Based on the above data, across all genres which genre has the highest average gross earnings in runtime 100 to 120?

Answer: Animation

# Question 3: average gross earnings per genre for movies running 100-120 minutes.
# na.rm = TRUE ignores movies with no reported gross; without it a single missing
# value makes the whole genre's average NA. A genre whose movies all lack a gross
# figure comes out as NaN.
movies_df %>%
  filter(Runtime <= 120 & Runtime >= 100) %>%
  group_by(Genre) %>%
  summarize(avgGross = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  arrange(desc(avgGross))

# A tibble: 8 × 2
  Genre     avgGross
  <fct>        <dbl>
1 Animation    216. 
2 Adventure    210. 
3 Action        89.2
4 Horror        69.8
5 Drama         55.2
6 Comedy        37.4
7 Biography     35.9
8 Crime         NA