Load packages and libraries

#install.packages('rvest')
library('rvest')
library(tidyverse)
library(R.utils)   #  for insert()
library(ggplot2)
library(plotly)
library(RColorBrewer)

Read the HTML code from the IMDB website

# IMDb search URL: feature films released in 2016, 100 results requested
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page into an HTML document object
webpage <- url %>% read_html()

Goal: to scrape the following data

Rank: The rank of the film from 1 to 100 on the list of 100 most popular feature films released in 2016.

Title: The title of the feature film.

Description: The description of the feature film.

Runtime: The duration of the feature film.

Genre: The genre of the feature film.

Rating: The IMDb rating of the feature film.

Metascore: The metascore on IMDb website for the feature film.

Votes: Votes cast in favor of the feature film.

Gross_Earning_in_Mil: The gross earnings of the feature film in millions.

Director: The main director of the feature film. Note, in case of multiple directors, I’ll take only the first.

Actor: The main actor in the feature film. Note, in case of multiple actors, I’ll take only the first.

Use the Selector Gadget to scrape Rank. Copy the CSS Selector.

Get all the rankings

# Select the rank nodes using the CSS selector copied from Selector Gadget
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Pull the text out of each selected node and preview the first entries
rank_data <- rank_data_html %>% html_text()
head(rank_data)

## [1] "1." "2." "3." "4." "5." "6."

Convert to numeric format

# Coerce the rank strings (e.g. "1.") to numbers and preview the result
rank_data <- rank_data %>% as.numeric()
head(rank_data)

## [1] 1 2 3 4 5 6

Select titles. Copy the CSS selector. Scrape titles: paste CSS selector in the html_nodes function. Convert the title data to text.

# Select the title links, extract their text, and preview the first titles
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()
head(title_data)

## [1] "Suicide Squad"           "Deadpool"               
## [3] "In a Valley of Violence" "Brimstone"              
## [5] "Train to Busan"          "Hush"

Select and scrape Description

# Select the description paragraphs, extract their text, and preview them
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
description_data <- description_data_html %>% html_text()
head(description_data)

## [1] "\n    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "\n    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                              
## [3] "\n    A mysterious stranger and a random act of violence drag a town of misfits and nitwits into the bloody crosshairs of revenge."                                                       
## [4] "\n    From the moment the new reverend climbs the pulpit, Liz knows she and her family are in great danger."                                                                              
## [5] "\n    While a zombie virus breaks out in South Korea, passengers struggle to survive on the train from Seoul to Busan."                                                                   
## [6] "\n    A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."

# Strip every newline character from the descriptions
# (the pattern has no regex metacharacters, so a fixed match is equivalent)
description_data <- gsub("\n", "", description_data, fixed = TRUE)
head(description_data)

## [1] "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                              
## [3] "    A mysterious stranger and a random act of violence drag a town of misfits and nitwits into the bloody crosshairs of revenge."                                                       
## [4] "    From the moment the new reverend climbs the pulpit, Liz knows she and her family are in great danger."                                                                              
## [5] "    While a zombie virus breaks out in South Korea, passengers struggle to survive on the train from Seoul to Busan."                                                                   
## [6] "    A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."

Select and scrape Runtime

# Select the runtime nodes, extract their text, and preview the first values
runtime_data_html <- webpage %>% html_nodes(".runtime")
runtime_data <- runtime_data_html %>% html_text()
head(runtime_data)

## [1] "123 min" "108 min" "104 min" "148 min" "118 min" "82 min"

# Drop the " min" suffix and convert what is left to numbers in one step
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))
head(runtime_data)

## [1] 123 108 104 148 118  82

Select and scrape Genre

# Select the genre nodes, extract their text, and preview the first values
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text()
head(genre_data)

## [1] "\nAction, Adventure, Fantasy            "
## [2] "\nAction, Adventure, Comedy            " 
## [3] "\nAction, Western            "           
## [4] "\nDrama, Mystery, Thriller            "  
## [5] "\nAction, Horror, Thriller            "  
## [6] "\nHorror, Thriller            "

# Clean the genre strings in a single pipeline:
#   1. remove newline characters
#   2. remove all spaces
#   3. keep only the first genre (drop everything from the first comma on)
#   4. convert the text to a factor
genre_data <- genre_data %>%
  gsub("\n", "", ., fixed = TRUE) %>%
  gsub(" ", "", ., fixed = TRUE) %>%
  gsub(",.*", "", .) %>%
  as.factor()
head(genre_data)

## [1] Action Action Action Drama  Action Horror
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

Select and scrape Ratings

# Select the IMDb rating nodes, extract their text, and preview the values
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>% html_text()
head(rating_data)

## [1] "6.0" "8.0" "6.0" "7.1" "7.6" "6.6"

# Turn the rating strings into numbers and preview the result
rating_data <- rating_data %>% as.numeric()
head(rating_data)

## [1] 6.0 8.0 6.0 7.1 7.6 6.6

Select and scrape Votes

# Select the vote-count spans, extract their text, and preview the values
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
votes_data <- votes_data_html %>% html_text()
head(votes_data)

## [1] "591,415" "888,690" "15,553"  "35,692"  "158,114" "100,138"

# Remove the thousands separators, then convert the vote counts to numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
head(votes_data)

## [1] 591415 888690  15553  35692 158114 100138

Select and scrape directors

# Select the first link in the credits paragraph of each film (the first
# listed director), extract the text, and preview the names
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
directors_data <- directors_data_html %>% html_text()
head(directors_data)

## [1] "David Ayer"       "Tim Miller"       "Ti West"          "Martin Koolhoven"
## [5] "Sang-ho Yeon"     "Mike Flanagan"

# Store the director names as a factor and preview the first values
directors_data <- directors_data %>% as.factor()
head(directors_data)

## [1] David Ayer       Tim Miller       Ti West          Martin Koolhoven
## [5] Sang-ho Yeon     Mike Flanagan   
## 96 Levels: Alex Proyas Ana Lily Amirpour André Øvredal ... Zack Snyder

Select and scrape actors

# Select the link right after the ".ghost" separator in each film's credits
# (the first listed actor), extract the text, and preview the names
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
actors_data <- actors_data_html %>% html_text()
head(actors_data)

## [1] "Will Smith"         "Ryan Reynolds"      "Ethan Hawke"       
## [4] "Guy Pearce"         "Yoo Gong"           "John Gallagher Jr."

# Store the actor names as a factor and preview the first values
actors_data <- actors_data %>% as.factor()
head(actors_data)

## [1] Will Smith         Ryan Reynolds      Ethan Hawke        Guy Pearce        
## [5] Yoo Gong           John Gallagher Jr.
## 89 Levels: Aamir Khan Adam Sandler Alexander Skarsgård ... Yoo Gong

Select and scrape metascore

# Select the metascore nodes, extract their text, and preview the values
metascore_data_html <- webpage %>% html_nodes(".metascore")
metascore_data <- metascore_data_html %>% html_text()
head(metascore_data)

## [1] "40        " "65        " "64        " "45        " "72        "
## [6] "67        "

# Remove the padding spaces from each metascore, then count how many
# scores were actually scraped
metascore_data <- gsub(" ", "", metascore_data, fixed = TRUE)
length(metascore_data)

## [1] 96

For films with a missing metascore, identify them and manually add NA in their place - use insert() instead of the loop used in the tutorial.

To use insert(), note that each insertion shifts every later element down by one row, so subtract from each target row number the number of insertions that come before it.

# Final row positions (out of 100) of the films that have no metascore
ats <- c(21, 35, 81, 92)
# Shift each position back by the number of NAs inserted before it.
# seq_along(ats) - 1 gives 0, 1, 2, ... without relying on the easy-to-misread
# precedence of `1:length(ats)-1`.
ats <- ats - (seq_along(ats) - 1)
# Insert an NA at each adjusted position
metascore_data <- insert(metascore_data, ats, values = NA)
# Convert the metascores to numbers
metascore_data <- as.numeric(metascore_data)
length(metascore_data)

## [1] 100

# Summary statistics
summary(metascore_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   45.75   59.50   58.83   72.00   99.00       4

Select and scrape gross revenue

# Select the gross-earnings spans, extract their text, and preview the values
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
gross_data <- gross_data_html %>% html_text()
head(gross_data)

## [1] "$325.10M" "$363.07M" "$0.05M"   "$2.13M"   "$138.29M" "$93.43M"

# Remove the "$" and "M" signs so only the number remains.
# Only those two characters are stripped; taking a fixed-width slice such as
# substring(x, 2, 6) cut values of 100 or more short ("$363.07" -> "363.0").
gross_data <- gsub("[$M]", "", gross_data)
length(gross_data)

## [1] 90

For films with missing gross earnings, manually enter NA; use insert() rather than a loop.

# Final row positions (out of 100) of the films that have no gross figure
ats <- c(4, 6, 29, 35, 44, 77, 80, 85, 90, 92)
# Shift each position back by the number of NAs inserted before it.
# seq_along(ats) - 1 gives 0, 1, 2, ... without relying on the easy-to-misread
# precedence of `1:length(ats)-1`.
ats <- ats - (seq_along(ats) - 1)
# Insert an NA at each adjusted position
gross_data <- insert(gross_data, ats, values = NA)
# Convert the earnings to numbers
gross_data <- as.numeric(gross_data)
length(gross_data)

## [1] 100

summary(gross_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.02   12.49   52.30   87.28  101.92  532.10      10

Combine all the lists to form a data frame

# Assemble the scraped vectors into a single data frame (one column per
# scraped field), then inspect the resulting column types
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Earning = gross_data,
  Director = directors_data,
  Actor = actors_data
)
str(movies_df)

## 'data.frame':    100 obs. of  11 variables:
##  $ Rank       : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title      : chr  "Suicide Squad" "Deadpool" "In a Valley of Violence" "Brimstone" ...
##  $ Description: chr  "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the"| __truncated__ "    A mysterious stranger and a random act of violence drag a town of misfits and nitwits into the bloody cross"| __truncated__ "    From the moment the new reverend climbs the pulpit, Liz knows she and her family are in great danger." ...
##  $ Runtime    : num  123 108 104 148 118 82 117 132 127 139 ...
##  $ Genre      : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 7 1 8 8 1 1 4 ...
##  $ Rating     : num  6 8 6 7.1 7.6 6.6 7.3 6.9 5.4 8.1 ...
##  $ Metascore  : num  40 65 64 45 72 67 62 54 25 71 ...
##  $ Votes      : num  591415 888690 15553 35692 158114 ...
##  $ Earning    : num  325.1 363 0.05 NA 2.13 ...
##  $ Director   : Factor w/ 96 levels "Alex Proyas",..: 21 91 89 55 79 60 53 8 1 58 ...
##  $ Actor      : Factor w/ 89 levels "Aamir Khan","Adam Sandler",..: 88 71 30 35 89 41 37 20 12 5 ...

Visualize the results

Plot 1:

# Histogram of runtimes, outlined and filled by genre.
# `bins` is a parameter of the histogram layer, not an aesthetic, so it is
# passed to geom_histogram() instead of aes().
p1 <- ggplot(data = movies_df, aes(x = Runtime, color = Genre, fill = Genre)) +
  geom_histogram(position = "identity", bins = 30) +
  scale_fill_brewer(palette = "Accent") +
  theme_classic()

# Render the plot as an interactive plotly widget
ggplotly(p1)

Question 1: based on Plot 1, the longest time belongs to:

A drama movie runs for about 160 min.

Find the actual longest movie:

# Earlier index-based version, kept for reference:
#long <- which(movies_df$Runtime==max(movies_df$Runtime))  # find the longest runtime movie
#print (paste("Rank:", movies_df$Rank[long], "- Title:", movies_df$Title[long], "- Runtime:", movies_df$Runtime[long], "min - Genre:", movies_df$Genre[long]))
# Simpler version with filter(): keep the row(s) whose runtime equals the
# maximum. na.rm = TRUE keeps a missing runtime from turning the maximum into
# NA, which would leave no rows to report.
long <- filter(movies_df, Runtime == max(Runtime, na.rm = TRUE))
print(paste("Rank:", long$Rank, "- Title:", long$Title, "- Runtime:", long$Runtime, "min - Genre:", long$Genre))

## [1] "Rank: 54 - Title: American Honey - Runtime: 163 min - Genre: Drama"

Plot 2:

# Bubble chart of rating against runtime: point size encodes the vote count
# and point colour encodes the genre
p2 <- ggplot(movies_df, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre)) +
  scale_color_brewer(palette = "Accent") +
  theme_grey() +
  scale_size(range = c(1, 10))

# Render the plot as an interactive plotly widget
ggplotly(p2)

Question 2: based on Plot 2, within runtime 130-160, the highest votes belong to:

Genre: Action - it has the largest bubble diameter in that runtime range.

Find the movie:

# Restrict to films running 130-160 minutes, then keep the row(s) with the
# most votes. na.rm = TRUE keeps a missing vote count from turning the
# maximum into NA, which would leave no rows to report.
mxvotes <- movies_df %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  filter(Votes == max(Votes, na.rm = TRUE))
print(paste("Rank:", mxvotes$Rank, "- Title:", mxvotes$Title, "- Runtime:", mxvotes$Runtime, "min - Genre:", mxvotes$Genre, "- Votes:", mxvotes$Votes))

## [1] "Rank: 32 - Title: Captain America: Civil War - Runtime: 147 min - Genre: Action - Votes: 650796"

Plot 3

# Bubble chart of earnings against runtime: point size encodes the rating
# and point colour encodes the genre; points are slightly transparent
p3 <- ggplot(movies_df, aes(x = Runtime, y = Earning)) +
  geom_point(aes(size = Rating, colour = Genre), alpha = 0.8) +
  scale_color_brewer(palette = "Set1") +
  theme_grey() +
  scale_size(range = c(1, 6))

# Render the plot as an interactive plotly widget
ggplotly(p3)

Question 3: based on Plot 3, within runtime 100-120, the highest earnings belong to:

Genres: Adventure and Action have the highest earnings.

Find the movie with the absolute highest earning:

# Among films running 100-120 minutes that have a known gross figure,
# keep the row(s) with the highest earnings and report them
mxearn <- movies_df %>%
  filter(Runtime >= 100, Runtime <= 120, !is.na(Earning)) %>%
  filter(Earning == max(Earning))
print(paste(
  "Rank:", mxearn$Rank,
  "- Title:", mxearn$Title,
  "- Runtime:", mxearn$Runtime,
  "min - Genre:", mxearn$Genre,
  "- Votes:", mxearn$Votes,
  "- Earnings:", mxearn$Earning
))

## [1] "Rank: 89 - Title: The Jungle Book - Runtime: 106 min - Genre: Adventure - Votes: 256311 - Earnings: 364"

Web Scraping

Raul Miranda

10/31/2020