Web_Scraping

library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.3     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#IMDb search URL (query asks for count=100, release_date 2016-2016, feature titles)
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'

#Fetching and parsing the search results page
webpage <- read_html(url)

#Scraping the rankings: select the nodes, take their text, convert to numerical
rank_data <- webpage |>
  html_nodes('.text-primary') |>
  html_text() |>
  as.numeric()

head(rank_data)

## [1] 1 2 3 4 5 6

#Scraping the title data: text of the links inside each list item header
title_data <- webpage |>
  html_nodes('.lister-item-header a') |>
  html_text()

head(title_data)

## [1] "Terrifier"       "Suicide Squad"   "Silence"         "Hush"           
## [5] "The Conjuring 2" "Split"

length(title_data)

## [1] 100

#Selecting the .text-muted element that immediately follows each .ratings-bar
description_data_html <- webpage |>
  html_nodes('.ratings-bar+ .text-muted')

#Data-Preprocessing: extracting the text and stripping every '\n'
description_data <- gsub("\n", "", html_text(description_data_html))

head(description_data)

## [1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."                                                                       
## [2] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [3] "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism."  
## [4] "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."                         
## [5] "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."                                     
## [6] "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."

length(description_data)

## [1] 100

#Selecting the Movie runtime nodes
runtime_data_html <- webpage |>
  html_nodes('.text-muted .runtime')

#Data-Preprocessing: taking the text, dropping " min", then converting
#what is left to numerical
runtime_data <- runtime_data_html |>
  html_text() |>
  gsub(pattern = " min", replacement = "") |>
  as.numeric()

head(runtime_data)

## [1]  85 123 161  82 134 117

length(runtime_data)

## [1] 100

#Selecting the Movie genre nodes
genre_data_html <- webpage |>
  html_nodes('.genre')

#Data-Preprocessing, in order: take the text, remove '\n', remove every
#space, keep only the first genre (the first comma and everything after it
#is dropped), then convert from text to factor
genre_data <- genre_data_html |>
  html_text() |>
  gsub(pattern = "\n", replacement = "") |>
  gsub(pattern = " ", replacement = "") |>
  gsub(pattern = ",.*", replacement = "") |>
  as.factor()

length(genre_data)

## [1] 100

head(genre_data)

## [1] Horror Action Drama  Horror Horror Horror
## 9 Levels: Action Adventure Animation Biography Comedy Crime Drama ... Horror

#Selecting the IMDB rating nodes
rating_data_html <- webpage |>
  html_nodes('.ratings-imdb-rating strong')

#Data-Preprocessing: taking the text and converting ratings to numerical
rating_data <- as.numeric(html_text(rating_data_html))

head(rating_data)

## [1] 5.6 5.9 7.2 6.6 7.3 7.3

length(rating_data)

## [1] 100

#Selecting the votes nodes (second child span of each votes bar)
votes_data_html <- webpage |>
  html_nodes('.sort-num_votes-visible span:nth-child(2)')

#Data-Preprocessing: taking the text, removing the commas, then
#converting votes to numerical
votes_data <- votes_data_html |>
  html_text() |>
  gsub(pattern = ",", replacement = "") |>
  as.numeric()

head(votes_data)

## [1]  47581 710148 119422 149186 292174 532772

length(votes_data)

## [1] 100

#Selecting the directors nodes (first link of the paragraph after .text-muted)
directors_data_html <- webpage |>
  html_nodes('.text-muted+ p a:nth-child(1)')

#Data-Preprocessing: taking the text and converting directors data into factors
directors_data <- as.factor(html_text(directors_data_html))

head(directors_data)

## [1] Damien Leone       David Ayer         Martin Scorsese    Mike Flanagan     
## [5] James Wan          M. Night Shyamalan
## 97 Levels: Alessandro Carloni Alex Proyas Ana Lily Amirpour ... Zack Snyder

length(directors_data)

## [1] 100

#Using CSS selectors to scrape the actors section
#Looking the actor link up once per '.lister-item-content' card instead of
#across the whole page: html_node() (singular) returns exactly one result per
#card and a missing node when the card has no match, so the vector stays
#aligned with the other columns without patching in a hard-coded position
actors_data_html <- webpage |>
  html_nodes('.lister-item-content') |>
  html_node('.ghost+ a')

#Converting the actors data to text; a missing node becomes a real NA rather
#than the string "NA"
actors_data <- html_text(actors_data_html)

#Data-Preprocessing: converting actors data into factors (NA is kept as a
#missing value, not as a factor level)
actors_data <- as.factor(actors_data)

actors_data

##   [1] Jenna Kanell         Will Smith           Andrew Garfield     
##   [4] John Gallagher Jr.   Vera Farmiga         James McAvoy        
##   [7] Andrew Garfield      Kim Min-hee          Ryan Gosling        
##  [10] Matthew McConaughey  Amy Adams            Auli'i Cravalho     
##  [13] Anna Kendrick        Ryan Reynolds        Gong Yoo            
##  [16] Russell Crowe        Felicity Jones       Chris Pine          
##  [19] Eva Green            Brian Cox            Ginnifer Goodwin    
##  [22] Ryunosuke Kamiki     Denzel Washington    Amy Adams           
##  [25] Melissa McCarthy     Alexander Skarsgård  Emilia Clarke       
##  [28] Jennifer Lawrence    Elizabeth Reaser     Taraji P. Henson    
##  [31] Stephen Lang         Jun Kunimura         Eddie Redmayne      
##  [34] Casey Affleck        Kate Bosworth        Teresa Palmer       
##  [37] John Goodman         Tom Cruise           Seth Rogen          
##  [40] Michelle Williams    John Krasinski       Samantha Robinson   
##  [43] Natalya Anisimova    Ben Affleck          Ben Affleck         
##  [46] Chris Evans          Erika Linder         Liam Hemsworth      
##  [49] Brenton Thwaites     Michael Keaton       James McAvoy        
##  [52] Elle Fanning         Mahershala Ali       Benedict Cumberbatch
##  [55] Matt Damon           Neel Sethi           Jonah Hill          
##  [58] Garance Marillier    Nia Vardalos         Emily Blunt         
##  [61] Sacha Baron Cohen    Viggo Mortensen      Travis Fimmel       
##  [64] Mario Casas          Lily James           John Gallagher Jr.  
##  [67] Dane DeHaan          Mark Rylance         Keanu Reeves        
##  [70] Matthew McConaughey  Sam Neill            Dev Patel           
##  [73] Matthew McConaughey  Hailee Steinfeld     Suki Waterhouse     
##  [76] Emma Roberts         Malcolm McDowell     Chloë Grace Moretz  
##  [79] Mark Wahlberg        Charlie Hunnam       Matt Damon          
##  [82] Lauren Cohan         Michael Fassbender   Jesse Eisenberg     
##  [85] Mia Wasikowska       Louis C.K.           Mila Kunis          
##  [88] Brad Pitt            Robert De Niro       Jason Statham       
##  [91] Jack Black           Benjamin Walker      Blake Lively        
##  [94] Frank Grillo         Aaron Poole          Summer H. Howell    
##  [97] Chris Pine           Denzel Washington    NA                  
## [100] Tom Hanks           
## 90 Levels: Aaron Poole Alexander Skarsgård Amy Adams ... Will Smith

length(actors_data)

## [1] 100

#Using CSS selectors to scrape the metascore section
#Looking the metascore up once per movie card: html_node() (singular) returns
#one result per card and a missing node when there is no match, so movies
#without a metascore get NA in their own row. Splicing at hard-coded
#positions with 1:(i-1) breaks for i = 1, because 1:0 is c(1, 0) and
#duplicates the first score, shifting every later value
#NOTE(review): assumes each card's '.metascore' sits inside its
#'.lister-item-content' element - confirm against the page markup
metascore_data_html <- webpage |>
  html_nodes('.lister-item-content') |>
  html_node('.metascore')

#Converting the metascore data to text (missing nodes become NA)
metascore_data <- html_text(metascore_data_html)

#Data-Preprocessing: removing extra space in metascore
metascore_data <- gsub(" ", "", metascore_data)

#Data-Preprocessing: converting metascore to numerical
metascore_data <- as.numeric(metascore_data)

## Warning: NAs introduced by coercion

head(metascore_data)

## [1] 40 NA 40 79 67 65

length(metascore_data)

## [1] 100

#Let's look at summary statistics
summary(metascore_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   46.75   61.50   60.15   72.25   99.00       4

#Using CSS selectors to scrape the gross revenue section
gross_data_html <- html_nodes(webpage,'.ghost~ .text-muted+ span')

#Converting the gross revenue data to text
gross_data <- html_text(gross_data_html)

#Data-Preprocessing: removing the '$' and 'M' characters by pattern, so a
#value keeps all of its digits whatever its width
gross_data <- gsub("[$M]", "", gross_data)

#Filling in missing entries at hard-coded positions. append(after = i - 1)
#puts the NA at position i of the growing vector and is also correct for
#i = 1, where indexing with 1:(i-1) would pick element 1 (1:0 is c(1, 0)).
#NA_character_ keeps gross_data a character vector instead of a list
#NOTE(review): the positions presumably come from inspecting one snapshot of
#the page - confirm they still hold. gross_data is derived again further
#down from the votes bar text
for (i in c(1,2,39,40,50,52,59,90)){
  gross_data <- append(gross_data, NA_character_, after = i - 1)
}

#Data-Preprocessing: converting gross to numerical
gross_data <- as.numeric(gross_data)

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

#Text of each votes bar, one string per matched node
votes_bar_data <- html_text2(html_nodes(webpage, '.sort-num_votes-visible'))

head(votes_bar_data)

## [1] "Votes: 47,581"                    "Votes: 710,148 | Gross: $325.10M"
## [3] "Votes: 119,422 | Gross: $7.10M"   "Votes: 149,186"                  
## [5] "Votes: 292,174 | Gross: $102.47M" "Votes: 532,772 | Gross: $138.29M"

#Extracting the gross figure from the votes bar text. The capture group
#takes the whole number after '$' (e.g. "325.10" from "Gross: $325.10M"), so
#no digits are cut off by a fixed-width substring; entries without a
#'$' amount yield NA, which keeps one value per votes bar entry
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)")[, 2] |>
  as.numeric()
length(gross_data)

## [1] 100

summary(gross_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.01   15.50   55.87   93.09  122.05  532.10      10

#Assembling the scraped vectors into a data frame, one column per vector
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

#Inspecting the structure (column names and types) of the data frame
str(movies_df)

## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Terrifier" "Suicide Squad" "Silence" "Hush" ...
##  $ Description         : chr  "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is"| __truncated__ "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence "| __truncated__ ...
##  $ Runtime             : num  85 123 161 82 134 117 139 145 128 108 ...
##  $ Genre               : Factor w/ 9 levels "Action","Adventure",..: 9 1 7 9 9 9 4 7 5 3 ...
##  $ Rating              : num  5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
##  $ Metascore           : num  40 NA 40 79 67 65 63 71 85 94 ...
##  $ Votes               : num  47581 710148 119422 149186 292174 ...
##  $ Gross_Earning_in_Mil: num  NA 325.1 7.1 NA 102.4 ...
##  $ Director            : Factor w/ 97 levels "Alessandro Carloni",..: 19 22 61 65 44 59 63 71 18 34 ...
##  $ Actor               : Factor w/ 90 levels "Aaron Poole",..: 39 90 4 42 88 37 4 49 73 60 ...

library('ggplot2')

#Histogram of runtimes (30 bins), bars filled by genre. Written with
#ggplot() + geom_histogram() because qplot() is deprecated since ggplot2 3.4.0
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

Using select and arrange functions, we are able to see that “Silence” had the longest runtime and it is in the Drama category.

#Keeping only title, runtime and genre, ordered from longest to shortest runtime
long_runtime <- arrange(
  select(movies_df, Title, Runtime, Genre),
  desc(Runtime)
)

head(long_runtime)

##                                Title Runtime  Genre
## 1                            Silence     161  Drama
## 2                        The Wailing     156  Drama
## 3 Batman v Superman: Dawn of Justice     151 Action
## 4         Captain America: Civil War     147 Action
## 5                A Cure for Wellness     146  Drama
## 6                     The Handmaiden     145  Drama

#Scatter plot of rating against runtime; point size shows votes, colour shows genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

The Action category has the most votes in the 130-160 min runtime range.

#Total votes per genre for movies with a runtime of 130-160 minutes.
#between() keeps rows with 130 <= Runtime <= 160 (both ends included);
#joining the two bounds with `|` would be TRUE for every runtime and so
#filter nothing. summarise() returns one row per genre, and
#.groups = "drop" leaves the result ungrouped
#NOTE(review): output rendered before this fix was computed on all movies
#and must be regenerated
range_votes <- movies_df |>
  filter(between(Runtime, 130, 160)) |>
  group_by(Genre) |>
  summarise(genre_votes = sum(Votes), .groups = "drop") |>
  arrange(desc(genre_votes))

range_votes

## # A tibble: 9 × 2
## # Groups:   Genre [9]
##   Genre     genre_votes
##   <fct>           <dbl>
## 1 Action       10104107
## 2 Drama         3583526
## 3 Animation     2062904
## 4 Biography     1834880
## 5 Horror        1614079
## 6 Adventure     1412673
## 7 Comedy        1321686
## 8 Crime          952645
## 9 Fantasy          4598

#Scatter plot of gross earnings against runtime; point size shows rating,
#colour shows genre
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Rating, colour = Genre))

## Warning: Removed 10 rows containing missing values (`geom_point()`).

Question 3: Based on the above data, across all genres, which genre has the highest average gross earnings for runtimes of 100 to 120 minutes?

The code chunk below shows us that Animation had the highest average gross earnings in the 100-120 min runtime range.

#Average gross earnings per genre for movies with a runtime of 100-120
#minutes. between() keeps rows with 100 <= Runtime <= 120 (both ends
#included); joining the two bounds with `|` would be TRUE for every runtime
#and so filter nothing. Missing gross values are ignored by the mean; a
#genre with no gross values at all yields NaN
#NOTE(review): output rendered before this fix was computed on all movies
#and must be regenerated
earnings_by_genre <- movies_df |>
  filter(between(Runtime, 100, 120)) |>
  group_by(Genre) |>
  summarise(
    genre_earnings = mean(Gross_Earning_in_Mil, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(desc(genre_earnings))

earnings_by_genre

## # A tibble: 9 × 2
## # Groups:   Genre [9]
##   Genre     genre_earnings
##   <fct>              <dbl>
## 1 Animation          204. 
## 2 Adventure          119. 
## 3 Action             117. 
## 4 Biography           68.1
## 5 Crime               63.9
## 6 Comedy              54.3
## 7 Horror              39.6
## 8 Drama               36.4
## 9 Fantasy            NaN

Web_Scraping

Brian Caceres

2023-10-28

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

Question 3: Based on the above data, across all genres, which genre has the highest average gross earnings for runtimes of 100 to 120 minutes?