Load Package

library(rvest)

Set WD

# Point the R session at the assignment folder.
# NOTE(review): this absolute path exists only on the author's machine, and no
# statement visible in this script reads or writes a relative path - confirm
# whether the call is still needed.
setwd("/Users/smhenderson/Desktop/DATA110/Assignments")

Specifying the url for desired website to be scraped

# Search-result page to scrape; the query string asks for 100 feature films
# with a 2016 release date.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

Reading the HTML code from the website

# Download and parse the page; every selector further down queries this object.
webpage <- read_html(x = url)

Rankings

# Select the nodes holding the rankings with a CSS selector.
rank_data_html <- webpage %>%
  html_nodes(".text-primary")

# Pull the text out of the selected nodes.
rank_data <- rank_data_html %>%
  html_text()

# Preview the raw rank strings.
head(rank_data)

## [1] "1." "2." "3." "4." "5." "6."

# The ranks arrive as text such as "1."; turn them into numbers.
rank_data <- rank_data %>%
  as.numeric()

# Preview the numeric ranks.
head(rank_data)

## [1] 1 2 3 4 5 6

Titles

# Select the title links with a CSS selector.
title_data_html <- webpage %>%
  html_nodes(".lister-item-header a")

# Extract the link text, i.e. the titles.
title_data <- title_data_html %>%
  html_text()

# Preview the first titles.
head(title_data)

## [1] "Arrival"                            "Hacksaw Ridge"                     
## [3] "Terrifier"                          "Suicide Squad"                     
## [5] "Batman v Superman: Dawn of Justice" "Me Before You"

Descriptions

# Select the description paragraphs with a CSS selector.
description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")

# Extract the description text.
description_data <- description_data_html %>%
  html_text()

# Preview the raw descriptions.
head(description_data)

## [1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
## [2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
## [4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
## [5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
## [6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."

# Clean-up: delete every line-break character from the descriptions.
description_data <- description_data %>%
  gsub(pattern = "\n", replacement = "")

# Preview the cleaned descriptions.
head(description_data)

## [1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
## [2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
## [3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
## [4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
## [5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
## [6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."

Movie runtime

# Select the runtime nodes with a CSS selector.
runtime_data_html <- webpage %>%
  html_nodes(".text-muted .runtime")

# Extract their text.
runtime_data <- runtime_data_html %>%
  html_text()

# Preview the raw runtime strings.
head(runtime_data)

## [1] "116 min" "139 min" "85 min"  "123 min" "151 min" "106 min"

# Clean-up: strip the " min" suffix and convert what is left to numbers.
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

# Preview the numeric runtimes.
head(runtime_data)

## [1] 116 139  85 123 151 106

Movie genre

# Select the genre nodes with a CSS selector.
genre_data_html <- webpage %>%
  html_nodes(".genre")

# Extract the genre text.
genre_data <- genre_data_html %>%
  html_text()

# Preview the raw genre strings.
head(genre_data)

## [1] "\nDrama, Mystery, Sci-Fi            "    
## [2] "\nBiography, Drama, History            " 
## [3] "\nHorror, Thriller            "          
## [4] "\nAction, Adventure, Fantasy            "
## [5] "\nAction, Adventure, Sci-Fi            " 
## [6] "\nDrama, Romance            "

# Clean-up: drop line breaks and every space, then cut each entry at its first
# comma so that only the leading genre of each movie is kept.
genre_data <- genre_data %>%
  gsub(pattern = "\n", replacement = "") %>%
  gsub(pattern = " ", replacement = "") %>%
  gsub(pattern = ",.*", replacement = "")

# Store the genre as a factor instead of plain text.
genre_data <- genre_data %>%
  as.factor()

# Preview the cleaned genres.
head(genre_data)

## [1] Drama     Biography Horror    Action    Action    Drama    
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

IMDB rating

# Select the IMDB rating nodes with a CSS selector.
rating_data_html <- webpage %>%
  html_nodes(".ratings-imdb-rating strong")

# Extract the rating text.
rating_data <- rating_data_html %>%
  html_text()

# Preview the raw rating strings.
head(rating_data)

## [1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"

# Clean-up: convert the rating strings to numbers.
rating_data <- rating_data %>%
  as.numeric()

# Preview the numeric ratings.
head(rating_data)

## [1] 7.9 8.1 5.6 5.9 6.4 7.4

Votes

# Select the vote-count nodes with a CSS selector.
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Extract the vote-count text.
votes_data <- votes_data_html %>%
  html_text()

# Preview the raw vote strings.
head(votes_data)

## [1] "722,867" "553,396" "43,192"  "701,331" "714,469" "268,236"

# Clean-up: remove the thousands separators, then convert to numbers.
votes_data <- as.numeric(gsub(",", "", votes_data))

# Preview the numeric vote counts.
head(votes_data)

## [1] 722867 553396  43192 701331 714469 268236

Directors

# Select the director links with a CSS selector.
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the director names.
directors_data <- directors_data_html %>%
  html_text()

# Preview the first directors.
head(directors_data)

## [1] "Denis Villeneuve" "Mel Gibson"       "Damien Leone"     "David Ayer"      
## [5] "Zack Snyder"      "Thea Sharrock"

# Clean-up: store the directors as a factor.
directors_data <- directors_data %>%
  as.factor()

# Select the actor links with a CSS selector.
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")

# Extract the actor names.
actors_data <- actors_data_html %>%
  html_text()

# Preview the first actors.
head(actors_data)

## [1] "Amy Adams"       "Andrew Garfield" "Jenna Kanell"    "Will Smith"     
## [5] "Ben Affleck"     "Emilia Clarke"

# Clean-up: store the actors as a factor.
actors_data <- actors_data %>%
  as.factor()

Metascore

# Select the Metascore nodes with a CSS selector.
metascore_data_html <- webpage %>%
  html_nodes(".metascore")

# Extract the Metascore text.
metascore_data <- metascore_data_html %>%
  html_text()

# Preview the raw Metascore strings.
head(metascore_data)

## [1] "81        " "71        " "40        " "44        " "51        "
## [6] "65        "

# Clean-up: delete the padding spaces around each Metascore.
metascore_data <- metascore_data %>%
  gsub(pattern = " ", replacement = "")

# Count the scraped Metascores.
length(metascore_data)

## [1] 95

Find metascore data with missing values and replace with NAs (this is an automated method)

library(stringr)
# Read the full text of every ratings bar, so the Metascore can be looked up
# per bar rather than from the Metascore nodes alone.
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))

# Preview the ratings-bar text.
head(ratings_bar_data)

## [1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
## [2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
## [3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
## [4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
## [5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
## [6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"

# Pull the Metascore out of each ratings bar. The capture group holds the
# digits that directly precede " Metascore"; a bar without a Metascore does not
# match and yields NA, so the result has exactly one entry per ratings bar.
# str_match() returns a matrix (full match first, then one column per capture
# group), so only the group column is kept: running as.numeric() over the whole
# matrix would flatten both columns and store every score twice.
# "\\d+" rather than exactly two digits also reads one- and three-digit scores.
metascore_data <- str_match(ratings_bar_data, "(\\d+) Metascore")[, 2] %>%
  as.numeric() # convert to number

# Number of extracted Metascore entries.
# NOTE(review): the page is requested with count=100, so the 200 recorded below
# (with the values repeating from index 101 onward) means every score is stored
# twice - this points at the extraction step yielding two entries per listing.
length(metascore_data)

## [1] 200

# Print the whole vector; NA marks a ratings bar without a Metascore.
metascore_data

##   [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
##  [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
##  [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
##  [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA
## [101] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
## [126] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
## [151] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
## [176] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA

# Quartiles, mean and NA count of the Metascores.
summary(metascore_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   61.00   60.05   72.75   99.00      10

Find the missing gross earnings (automated). Earnings are part of the votes bar in the HTML; scrape the votes bar and extract the earnings with a regular expression to get the NAs in context.

# Read the full text of every votes bar (vote count plus, where present, the
# gross earnings).
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))

# Preview the votes-bar text.
head(votes_bar_data)

## [1] "Votes: 722,867 | Gross: $100.55M" "Votes: 553,396 | Gross: $67.21M" 
## [3] "Votes: 43,192"                    "Votes: 701,331 | Gross: $325.10M"
## [5] "Votes: 714,469 | Gross: $330.36M" "Votes: 268,236 | Gross: $56.25M"

# Extract the gross earnings (in millions of dollars) from each votes bar.
# The capture group takes every digit and decimal point that follows the "$",
# which leaves out both the "$" and the trailing "M". A votes bar without a
# gross figure does not match and yields NA, so the result stays aligned with
# the votes bars.
# Cutting a fixed five characters out of the string instead would truncate
# amounts of 100 million and more (e.g. "$330.36M" would be read as 330.3).
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)")[, 2] %>%
  as.numeric()

# One entry per votes bar is expected.
length(gross_data)

## [1] 100

Create a dataframe and inspect its structure

# Assemble the scraped vectors into one data frame, one column per attribute.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types and the first values of the data frame.
str(movies_df)

## 'data.frame':    200 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
##  $ Description         : chr  "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
##  $ Runtime             : num  116 139 85 123 151 106 108 111 128 107 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
##  $ Rating              : num  7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
##  $ Metascore           : num  81 71 NA 40 44 51 65 26 94 81 ...
##  $ Votes               : num  722867 553396 43192 701331 714469 ...
##  $ Gross_Earning_in_Mil: num  100.5 67.2 NA 325.1 330.3 ...
##  $ Director            : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
##  $ Actor               : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...

Analyzing scraped data from the web

library('ggplot2')

# Histogram of movie runtimes in 30 bins, with bars filled by genre.
# Written as ggplot() + geom_histogram() because qplot() is deprecated as of
# ggplot2 3.4.0; the mapping and bin count are the ones qplot() was given.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

Which movie from which Genre had the longest runtime?

#Load package and prepare data
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Keep the ten movies with the longest runtime, longest first.
long_runtime <- head(arrange(movies_df, desc(Runtime)), n = 10)

# Bar chart of the ten longest movies, one bar per title, filled by genre.
# Titles are wrapped at 15 characters so the axis labels do not overlap.
ggplot(
  long_runtime,
  aes(x = str_wrap(Title, width = 15), y = Runtime, fill = Genre)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Longest Runtime by Movie and Genre",
    x = "Movies",
    y = "Runtime"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(hjust = 0.5, vjust = 1, size = 8),
    plot.title = element_text(hjust = 0.5),
    panel.grid = element_blank()
  ) +
  scale_fill_discrete(name = "Genre")

# Runtime against rating; point size shows the votes, colour shows the genre.
ggplot(movies_df, aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

In the Runtime of 130-160 mins, which genre has the highest votes?

# Total votes per genre and runtime for movies running 130 to 160 minutes,
# largest total first.
# `.groups` is an argument of summarize(): handed to group_by() it is taken as
# a new grouping column named ".groups" and the result stays grouped, so it is
# passed to summarize() here to return an ungrouped table.
high_votes <- movies_df %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  group_by(Genre, Runtime) %>%
  summarize(vote_sum = sum(Votes), .groups = "drop") %>%
  arrange(desc(vote_sum))

# Scatterplot of summed votes against runtime (130-160 minutes); point size
# follows the vote total and colour marks the genre.
ggplot(high_votes, aes(x = Runtime, y = vote_sum)) +
  geom_point(mapping = aes(size = vote_sum, colour = Genre)) +
  labs(
    title = "Genre with the Highest Votes in the Runtime of 130-160 Mins",
    x = "Runtime",
    y = "Votes"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.key = element_blank()
  )

# Prefer fixed over scientific notation when numbers are printed.
options(scipen = 999)

# Runtime against gross earnings; point size shows the rating, colour the genre.
ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Rating, colour = Genre))

## Warning: Removed 18 rows containing missing values (`geom_point()`).

Across all genres, which genre has the highest average gross earnings in runtime 100 to 120?

# Average gross earnings per genre and runtime for movies running 100 to 120
# minutes, highest average first.
# `.groups` is an argument of summarize(): handed to group_by() it is taken as
# a new grouping column named ".groups" and the result stays grouped, so it is
# passed to summarize() here to return an ungrouped table.
# na.rm = TRUE keeps one movie without gross earnings from turning the average
# of its whole genre/runtime group into NA.
high_gross <- movies_df %>%
  filter(Runtime >= 100 & Runtime <= 120) %>%
  group_by(Genre, Runtime) %>%
  summarize(
    avg_gross = mean(Gross_Earning_in_Mil, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_gross))

# Scatterplot of average gross earnings against runtime (100-120 minutes);
# point size follows the average and colour marks the genre.
ggplot(high_gross, aes(x = Runtime, y = avg_gross)) +
  geom_point(mapping = aes(size = avg_gross, colour = Genre)) +
  labs(
    title = "Genres with the Highest Average Gross Earnings by Runtime",
    x = "Runtime",
    y = "Avg Gross Earnings"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.key = element_blank()
  )

## Warning: Removed 1 rows containing missing values (`geom_point()`).

# Prefer fixed over scientific notation when numbers are printed.
# NOTE(review): the same option is already set earlier in this script and no
# code follows this line in the visible source - this call looks redundant.
options(scipen = 999)

Web Scraping Tutorial

Shalanda Henderson

2023-06-28

Load Package

Set WD

Specifying the url for desired website to be scraped

Reading the HTML code from the website

Rankings

Titles

Descriptions

Movie runtime

Movie genre

IMDB rating

Votes

Directors

Metascore

Find metascore data with missing values and replace with NAs (this is an automated method)

Find the missing gross earnings (automated). Earnings are part of the votes bar in the HTML; scrape the votes bar and extract the earnings with a regular expression to get the NAs in context.

Create a dataframe and inspect its structure

Analyzing scraped data from the web

Which movie from which Genre had the longest runtime?

In the Runtime of 130-160 mins, which genre has the highest votes?

Across all genres, which genre has the highest average gross earnings in runtime 100 to 120?