Web Scraping

Author

Arif

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)

Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
# IMDb title search: feature films released in 2016, 100 results per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page once; the selectors that follow query this document
webpage <- url %>% read_html()
# Select the nodes that hold each movie's position in the list
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Pull the visible text out of those nodes
rank_data <- rank_data_html %>% html_text()

# Preview the first few raw rankings
head(rank_data)
[1] "1." "2." "3." "4." "5." "6."
# Data preprocessing: turn the ranking strings (e.g. "1.") into numbers
rank_data <- rank_data %>% as.numeric()

# Preview the converted rankings
head(rank_data)
[1] 1 2 3 4 5 6
# Select the link inside each list item's header, which carries the movie title
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Pull the title text out of those nodes
title_data <- title_data_html %>% html_text()

# Preview the first few titles
head(title_data)
[1] "Arrival"                            "Hacksaw Ridge"                     
[3] "Terrifier"                          "Suicide Squad"                     
[5] "Batman v Superman: Dawn of Justice" "Me Before You"                     
# Select the muted paragraph that directly follows each ratings bar (the plot summary)
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Pull the description text out of those nodes
description_data <- description_data_html %>% html_text()

# Preview the first few raw descriptions
head(description_data)
[1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
[2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
[3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
[4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
[5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
[6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."                                                                                                                   
# Data preprocessing: strip the newline characters from every description
description_data <- gsub("\n", "", description_data, fixed = TRUE)

# Preview the cleaned descriptions
head(description_data)
[1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."                                                                                  
[2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
[3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                                                        
[4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                 
[5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                       
[6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."                                                                                                                   
# Select the runtime element of each movie
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Pull the runtime text (e.g. "116 min") out of those nodes
runtime_data <- runtime_data_html %>% html_text()

# Preview the first few raw runtimes
head(runtime_data)
[1] "116 min" "139 min" "85 min"  "123 min" "151 min" "106 min"
# Data preprocessing: drop the " min" suffix, then convert what is left to numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

# Preview the numeric runtimes
head(runtime_data)
[1] 116 139  85 123 151 106
# Select the genre element of each movie
genre_data_html <- webpage %>% html_nodes(".genre")

# Pull the genre text out of those nodes
genre_data <- genre_data_html %>% html_text()

# Preview the first few raw genre strings
head(genre_data)
[1] "\nDrama, Mystery, Sci-Fi            "    
[2] "\nBiography, Drama, History            " 
[3] "\nHorror, Thriller            "          
[4] "\nAction, Adventure, Fantasy            "
[5] "\nAction, Adventure, Sci-Fi            " 
[6] "\nDrama, Romance            "            
# Data preprocessing, applied in order:
#   1. remove newline characters
#   2. remove every space
#   3. keep only the first genre by dropping everything from the first comma on
#   4. convert the remaining text to a factor
genre_data <- genre_data %>%
  str_remove_all("\n") %>%
  str_remove_all(" ") %>%
  str_remove(",.*") %>%
  as.factor()

# Preview the cleaned genres
head(genre_data)
[1] Drama     Biography Horror    Action    Action    Drama    
Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Select the bold IMDb rating value of each movie
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

# Pull the rating text out of those nodes
rating_data <- rating_data_html %>% html_text()

# Preview the first few raw ratings
head(rating_data)
[1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"
# Data preprocessing: convert the rating strings to numbers
rating_data <- rating_data %>% as.numeric()

# Preview the numeric ratings
head(rating_data)
[1] 7.9 8.1 5.6 5.9 6.4 7.4
# Select the second span of each votes bar, which holds the vote count
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Pull the vote-count text out of those nodes
votes_data <- votes_data_html %>% html_text()

# Preview the first few raw vote counts
head(votes_data)
[1] "722,916" "553,431" "43,200"  "701,352" "714,484" "268,259"
# Data preprocessing: remove the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))

# Preview the numeric vote counts
head(votes_data)
[1] 722916 553431  43200 701352 714484 268259
# Select the first link in the paragraph after the description (the director)
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Pull the director names out of those nodes
directors_data <- directors_data_html %>% html_text()

# Preview the first few directors
head(directors_data)
[1] "Denis Villeneuve" "Mel Gibson"       "Damien Leone"     "David Ayer"      
[5] "Zack Snyder"      "Thea Sharrock"   
# Data preprocessing: store the director names as a factor
directors_data <- directors_data %>% as.factor()

# Select the link that directly follows the ".ghost" separator (the first listed actor)
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

# Pull the actor names out of those nodes
actors_data <- actors_data_html %>% html_text()

# Preview the first few actors
head(actors_data)
[1] "Amy Adams"       "Andrew Garfield" "Jenna Kanell"    "Will Smith"     
[5] "Ben Affleck"     "Emilia Clarke"  
# Data preprocessing: store the actor names as a factor
actors_data <- actors_data %>% as.factor()
# Confirm there is one actor entry per movie
length(actors_data)
[1] 100
library(stringr)
# Grab the whole ratings bar of every movie as text; the Metascore, when a
# movie has one, appears at the end of this text
ratings_bar_data <- html_text2(html_nodes(webpage, ".ratings-bar"))
# Preview the first few ratings bars
head(ratings_bar_data)
[1] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n81 Metascore"
[2] "8.1\nRate this\n 1 2 3 4 5 6 7 8 9 10 8.1/10 X \n71 Metascore"
[3] "5.6\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.6/10 X "              
[4] "5.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 5.9/10 X \n40 Metascore"
[5] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
[6] "7.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.4/10 X \n51 Metascore"
# Extract the Metascore from each ratings bar and convert it to a number.
# The capture group accepts one to three digits, so a perfect score of 100 is
# read as 100 (a fixed two-digit pattern would read it as "00") and a
# single-digit score is not lost. Movies without a Metascore yield NA, which
# keeps the vector aligned with the other scraped columns.
metascore_data <- str_match(ratings_bar_data, "(\\d{1,3}) Metascore")[, 2] %>%
  as.numeric()
# One entry per ratings bar
length(metascore_data)
[1] 100
# Print the full Metascore vector (NA marks a movie without a Metascore)
metascore_data
  [1] 81 71 NA 40 44 51 65 26 94 81 74 70 78 85 62 65 25 48 NA 67 81 43 36 75 59
 [26] 73 72 41 54 88 74 99 21 52 51 48 66 NA 96 68 72 66 76 71 66 57 42 60 51 77
 [51] 38 66 90 61 65 58 58 44 56 59 55 32 40 28 52 NA 47 77 77 65 62 33 81 68 42
 [76] 60 32 79 69 81 60 47 57 42 80 78 39 72 58 78 46 66 35 58 64 45 69 51 23 NA
# Quartiles, mean, and NA count of the Metascores
summary(metascore_data)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  21.00   47.00   61.00   60.05   72.50   99.00       5 
# Grab the whole votes bar of every movie as text; it holds the vote count
# and, when available, the gross earnings
votes_bar_data <- html_text2(html_nodes(webpage, ".sort-num_votes-visible"))

# Preview the first few votes bars
head(votes_bar_data)
[1] "Votes: 722,916 | Gross: $100.55M" "Votes: 553,431 | Gross: $67.21M" 
[3] "Votes: 43,200"                    "Votes: 701,352 | Gross: $325.10M"
[5] "Votes: 714,484 | Gross: $330.36M" "Votes: 268,259 | Gross: $56.25M" 
# Extract the gross earnings (in millions of dollars) from each votes bar.
# The capture group takes every digit and decimal point after the "$", so
# amounts such as "$100.55M" keep their full precision; cutting the string at
# a fixed width would truncate them to 100.5. Movies without a reported
# gross yield NA, which keeps the vector aligned with the other columns.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)")[, 2] %>%
  as.numeric()
# One entry per votes bar
length(gross_data)
[1] 100
# Assemble the scraped vectors into one data frame, one row per movie
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types and the first few values
str(movies_df)
'data.frame':   100 obs. of  11 variables:
 $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Title               : chr  "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
 $ Description         : chr  "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
 $ Runtime             : num  116 139 85 123 151 106 108 111 128 107 ...
 $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
 $ Rating              : num  7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
 $ Metascore           : num  81 71 NA 40 44 51 65 26 94 81 ...
 $ Votes               : num  722916 553431 43200 701352 714484 ...
 $ Gross_Earning_in_Mil: num  100.5 67.2 NA 325.1 330.3 ...
 $ Director            : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
 $ Actor               : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...
library("ggplot2")
# Data behind the runtime histogram
plot1 <- movies_df
# Overlaid (not stacked) semi-transparent histograms of runtime, one per
# genre, in 5-minute bins. Built with ggplot() because qplot() is deprecated
# since ggplot2 3.4.0 and already draws a histogram of its own, so adding
# geom_histogram() to it drew two histogram layers on top of each other.
ggplot(plot1, aes(x = Runtime, fill = Genre)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = 5, color = "white") +
  scale_fill_discrete(name = "Genre") +
  labs(title = "100 Most Popular Feature Film in 2016 Runtime by Genre")
Warning: `qplot()` was deprecated in ggplot2 3.4.0.

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

Answer: Batman v Superman: Dawn of Justice (Ultimate Edition), from the Action genre, had the longest runtime (182 minutes).

# Keep only the columns needed to answer Question 1
movies_df1 <- movies_df %>% select(Title, Genre, Runtime)
# Show the row with the longest runtime
movies_df1[which.max(movies_df1[["Runtime"]]), ]
                                                   Title  Genre Runtime
19 Batman v Superman: Dawn of Justice (Ultimate Edition) Action     182

Question 2: Based on the above data, among movies with a runtime of 130–160 minutes, which genre has the movie with the highest number of votes?

Answer: Action. Captain America: Civil War (Genre: Action) received the most votes, with 815,574.

# Scatter plot of rating against runtime; point size shows the vote count
# and colour shows the genre.
# The `text` label is built from the Title column of movies_df rather than
# from the separate title_data vector, so each label always belongs to its
# own row. ggplot2 itself does not draw `text` (it warns that the aesthetic
# is ignored).
# NOTE(review): `text` is presumably meant for plotly::ggplotly() tooltips; confirm.
ggplot(movies_df, aes(x = Runtime, y = Rating)) +
  geom_point(
    aes(size = Votes, col = Genre, text = paste("Movie Title:", Title)),
    alpha = 0.7
  ) +
  labs(title = " 100 Most Popular Feature Film in 2016 Runtime by Ratings")
Warning in geom_point(aes(size = Votes, col = Genre, text = paste("Movie
Title:", : Ignoring unknown aesthetics: text

# Keep the columns needed for Question 2 and restrict to runtimes of 130-160 minutes
movie2_df2 <- movies_df %>%
  select(Title, Genre, Runtime, Votes) %>%
  filter(Runtime >= 130, Runtime <= 160)
# Show the row with the most votes in that runtime range
movie2_df2[which.max(movie2_df2[["Votes"]]), ]
                       Title  Genre Runtime  Votes
5 Captain America: Civil War Action     147 815574

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?

Answer: The Animation genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes, at 216.33 million dollars.

# Scatter plot of gross earnings against runtime; point size shows the IMDb
# rating and colour shows the genre. The y axis is fixed to -10..600.
ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(
    aes(size = Rating, col = Genre),
    alpha = 0.5
  ) +
  scale_y_continuous("Gross Earnings in Millions", limits = c(-10, 600)) +
  labs(title = " 100 Most Popular Feature Film in 2016 Gross Earnings in Millions")
Warning: Removed 9 rows containing missing values (`geom_point()`).

# Keep the columns needed for Question 3 and restrict to runtimes of 100-120 minutes
movie2_df3 <- select(movies_df, Title, Genre, Runtime, Gross_Earning_in_Mil)
movie2_df3 <- filter(movie2_df3, Runtime >= 100 & Runtime <= 120)
# Mean gross earnings per genre, ignoring movies with no reported gross.
# Written with summarise() directly because summarise_at() is superseded.
Genre_mean <- movie2_df3 %>%
  group_by(Genre) %>%
  summarise(Gross_Earning_in_Mil = mean(Gross_Earning_in_Mil, na.rm = TRUE))
# Show the genre with the highest mean gross earnings
Genre_mean[which.max(Genre_mean$Gross_Earning_in_Mil), ]
# A tibble: 1 × 2
  Genre     Gross_Earning_in_Mil
  <fct>                    <dbl>
1 Animation                 216.