Webscraping HW

Author

A. Diaz Nova

Introduction

As a data scientist, the web is a good place to find data and information, so being able to scrape your own data is an essential skill. Eventually you’ll be able to build a handy model for cases where data is not easily readable. Even when the data comes in an unstructured format, we should be able to grow the knowledge and expertise to use data in ways that can be beneficial all around.

Load in necessary packages/libraries

library('rvest')
library(stringr)
library(ggplot2)
library(dplyr)
library(plotly)

# Specifying the URL of the website to be scraped (served over HTTPS)
url <- "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Reading the HTML code from the website
webpage <- read_html(url)

# The selectors used in this document rely on the `.lister-item-*` page layout.
# If the downloaded page contains no such nodes, every scrape below silently
# returns an empty vector, so flag that situation up front.
if (length(html_nodes(webpage, ".lister-item-content")) == 0) {
  warning(
    "No '.lister-item-content' nodes were found on the page; ",
    "the CSS selectors used below will return empty results.",
    call. = FALSE
  )
}

Using R code to get all the rankings

# Select the ranking nodes through their CSS class
rank_data_html <- webpage |>
  html_nodes(".text-primary")

# Pull the text out of the ranking nodes
rank_data <- rank_data_html |>
  html_text()

# Preview the first few rankings
head(rank_data, n = 6)

character(0)

The data we have must be formatted correctly; in other words, we convert it to a numerical format (preprocessing step).

# Data preprocessing: coerce the scraped rankings from text to numbers
rank_data <- rank_data |>
  as.numeric()

# Preview the converted rankings
head(rank_data, n = 6)

numeric(0)

Select all the titles using R code

# Select the title links inside each item header
title_data_html <- webpage |>
  html_nodes(".lister-item-header a")

# Pull the text out of the title nodes
title_data <- title_data_html |>
  html_text()

# Preview the first few titles
head(title_data, n = 6)

character(0)

Now, we will scrape the rest of the categories of interest using R code - Description

# Same three steps as for the rankings and titles

# 1) Select the description nodes with a CSS selector
description_data_html <- webpage |>
  html_nodes(".ratings-bar+ .text-muted")

# 2) Pull the text out of the description nodes
description_data <- description_data_html |>
  html_text()

# 3) Preview the first few descriptions
head(description_data, n = 6)

character(0)

In a separate chunk, remove all the unnecessary characters (preprocessing step)

# Data preprocessing: strip every newline character from the descriptions
description_data <- gsub("\n", "", description_data, fixed = TRUE)

# Preview the cleaned descriptions
head(description_data, n = 6)

character(0)

Using R code to scrape for runtime

# 1) Select the runtime nodes with a CSS selector
runtime_data_html <- webpage |>
  html_nodes(".text-muted .runtime")

# 2) Pull the text out of the runtime nodes
runtime_data <- runtime_data_html |>
  html_text()

# 3) Preview the first few runtimes
head(runtime_data, n = 6)

character(0)

# 4) Data preprocessing: drop the " min" suffix, then convert to numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))

# 5) Preview the numeric runtimes
head(runtime_data, n = 6)

numeric(0)

Using R code to scrape for Movie genre

# 1) Select the genre nodes with a CSS selector
genre_data_html <- webpage |>
  html_nodes(".genre")

# 2) Pull the text out of the genre nodes
genre_data <- genre_data_html |>
  html_text()

# 3) Preview the first few genre strings
head(genre_data, n = 6)

character(0)

# 4-5) Data preprocessing: remove newlines and spaces in a single pass
genre_clean <- gsub("[\n ]", "", genre_data)

# 6) Keep only the first genre of each movie (drop everything from the first
#    comma onwards)
genre_first <- gsub(",.*", "", genre_clean)

# 7) Convert each genre from text to a factor
genre_data <- as.factor(genre_first)

# 8) Preview the cleaned genre factor
head(genre_data, n = 6)

factor()
Levels:

Using R code to scrape for IMDB rating

# 1) Select the IMDB rating nodes with a CSS selector
rating_data_html <- webpage |>
  html_nodes(".ratings-imdb-rating strong")

# 2) Pull the text out of the rating nodes
rating_data <- rating_data_html |>
  html_text()

# 3) Preview the first few ratings
head(rating_data, n = 6)

character(0)

# 4) Data preprocessing: coerce the ratings from text to numbers
rating_data <- rating_data |>
  as.numeric()

# 5) Preview the numeric ratings
head(rating_data, n = 6)

numeric(0)

Using R code to scrape for number of votes

# 1) Select the vote-count nodes with a CSS selector
votes_data_html <- webpage |>
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# 2) Pull the text out of the vote-count nodes
votes_data <- votes_data_html |>
  html_text()

# 3) Preview the first few vote counts
head(votes_data, n = 6)

character(0)

# 4-5) Data preprocessing: remove the thousands separators, then convert the
#      vote counts to numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))

# 6) Preview the numeric vote counts
head(votes_data, n = 6)

numeric(0)

Using R code to scrape for director and their name

# 1) Select the director links with a CSS selector
directors_data_html <- webpage |>
  html_nodes(".text-muted+ p a:nth-child(1)")

# 2) Pull the text out of the director nodes
directors_data <- directors_data_html |>
  html_text()

# 3) Preview the first few directors
head(directors_data, n = 6)

character(0)

# 4) Data preprocessing: store the director names as a factor
directors_data <- directors_data |>
  as.factor()

Using R code to scrape for actors and their names

# 1) Select the actor links with a CSS selector
actors_data_html <- webpage |>
  html_nodes(".lister-item-content .ghost+ a")

# 2) Pull the text out of the actor nodes
actors_data <- actors_data_html |>
  html_text()

# 3) Preview the first few actors
head(actors_data, n = 6)

character(0)

# 4) Data preprocessing: store the actor names as a factor
actors_data <- actors_data |>
  as.factor()

Using R code to scrape for Metascore ratings

# Scrape each movie's whole ratings bar and convert it to text
ratings_bar_nodes <- html_nodes(webpage, ".ratings-bar")
ratings_bar_data <- html_text2(ratings_bar_nodes)

# Preview the first few ratings bars
head(ratings_bar_data, n = 6)

character(0)

# look at the ratings bar

# Extract the Metascore from each ratings bar and convert it to a number.
# The capture group accepts one to three digits, so a score of 100 is read as
# 100 (a fixed two-digit pattern would match only its trailing "00") and
# single-digit scores are kept. Ratings bars without a Metascore yield NA,
# which keeps the vector the same length as `ratings_bar_data`.
metascore_match <- str_match(ratings_bar_data, "(\\d{1,3}) Metascore")
metascore_data <- as.numeric(metascore_match[, 2])

# Number of Metascore entries (one per ratings bar)
length(metascore_data)

[1] 0

# Print the full vector of extracted Metascores
metascore_data

numeric(0)

# Summary statistics of the extracted Metascores
summary(metascore_data)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.

Using R code to scrape for Gross variable

# Scrape each movie's whole votes bar and convert it to text
votes_bar_nodes <- html_nodes(webpage, ".sort-num_votes-visible")
votes_bar_data <- html_text2(votes_bar_nodes)

# Preview the first few votes bars
head(votes_bar_data, n = 6)

character(0)

# look at the votes bar data

# Extract the gross earnings from each votes bar. The capture group takes the
# whole number that follows the '$' sign, so the '$' prefix and the 'M' suffix
# are dropped without cutting the string at a fixed width (a fixed-width
# substring would truncate values such as "$532.18M" to 532.1).
# Votes bars without a gross figure yield NA, which keeps the vector the same
# length as `votes_bar_data`.
gross_match <- str_match(votes_bar_data, "\\$([0-9.,]+)")

# Clean data: remove any thousands separators and convert to a number
gross_data <- as.numeric(gsub(",", "", gross_match[, 2], fixed = TRUE))

# Number of gross-earnings entries (one per votes bar)
length(gross_data)

[1] 0

Creating a dataframe based off all the work done so far

# Combining all the lists to form a data frame
# Assemble the scraped vectors into a single data frame, one column each
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data
)

# Inspect the structure of the data frame
str(movies_df)

'data.frame':   0 obs. of  10 variables:
 $ Rank                : num 
 $ Title               : chr 
 $ Description         : chr 
 $ Runtime             : num 
 $ Genre               : Factor w/ 0 levels: 
 $ Rating              : num 
 $ Metascore           : num 
 $ Votes               : num 
 $ Gross_Earning_in_Mil: num 
 $ Director            : Factor w/ 0 levels:

First visualization

Question 1: Based on the visual, which movie, from which genre, had the longest runtime?

# Histogram of runtimes, stacked by genre.
# `bins` is a parameter of geom_histogram(), not an aesthetic, so it is passed
# to the geom rather than to aes(). The bars are coloured through the `fill`
# aesthetic, so the Brewer palette is applied with the fill scale (a colour
# scale has no effect on them).
# NOTE(review): the "Reds" palette provides at most 9 colours; confirm the
# number of genre levels does not exceed that.
p1 <- movies_df |>
  ggplot(aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30) +
  scale_fill_brewer(palette = "Reds")

# Convert the static plot into an interactive plotly widget and display it
p1 <- ggplotly(p1)
p1

# Keep only the Drama movies that run longer than 150 minutes
Q1 <- filter(movies_df, Genre == "Drama" & Runtime > 150)

# Preview the matching movies
head(Q1, n = 6)

 [1] Rank                 Title                Description         
 [4] Runtime              Genre                Rating              
 [7] Metascore            Votes                Gross_Earning_in_Mil
[10] Director            
<0 rows> (or 0-length row.names)

ANSWER: Drama had the longest runtime, and the name of the movie was “Silence”.

Second Visualization

Question 2: Based on the next visual, among movies with a runtime of 130-160 mins, which genre has the highest votes?

# Scatter plot of rating against runtime; point size encodes the number of
# votes and point colour encodes the genre
p2 <- ggplot(movies_df, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre))

# Convert the static plot into an interactive plotly widget and display it
p2 <- ggplotly(p2)
p2

# Summary statistics of the vote counts
summary(movies_df[["Votes"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.

# For movies running 130 to 160 minutes (inclusive), find the largest vote
# count within each genre and list the genres from most to fewest votes
Q2_runtime_window <- filter(movies_df, between(Runtime, 130, 160))
Q2_by_genre <- group_by(Q2_runtime_window, Genre)
Q2 <- Q2_by_genre |>
  summarise(max_votes = max(Votes)) |>
  arrange(desc(max_votes))

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `max_votes = max(Votes)`.
Caused by warning in `max()`:
! no non-missing arguments to max; returning -Inf

# Print the per-genre maximum vote counts
Q2

# A tibble: 0 × 2
# ℹ 2 variables: Genre <fct>, max_votes <dbl>

ANSWER: By a landslide, honestly: Action is at the top of the group compared to the rest of the genres.

Third Visualization

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 mins?

# Scatter plot of gross earnings against runtime; point size encodes the
# rating and point colour encodes the genre
p3 <- ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre))

# Convert the static plot into an interactive plotly widget and display it
p3 <- ggplotly(p3)
p3

# Summary statistics of the gross earnings
summary(movies_df[["Gross_Earning_in_Mil"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.

# For movies running 100 to 120 minutes (inclusive), compute the average gross
# earnings per genre and list the genres from highest to lowest average.
# Movies without a gross figure carry NA in `Gross_Earning_in_Mil`; they are
# excluded from the mean with `na.rm = TRUE`, otherwise a single missing value
# would turn the whole genre's average into NA.
Q3 <- movies_df |>
  filter(Runtime >= 100 & Runtime <= 120) |>
  group_by(Genre) |>
  summarise(avg_gross = mean(Gross_Earning_in_Mil, na.rm = TRUE)) |>
  arrange(desc(avg_gross))
Q3

# A tibble: 0 × 2
# ℹ 2 variables: Genre <fct>, avg_gross <dbl>

ANSWER: Lastly, across all genres where the runtime is between 100 and 120 mins, Animation reigns supreme over the other genres.