DATA110_Hw9_WebScraping

Install necessary packages for this project

#install.packages('rvest')
#Loading the rvest package
library(rvest)

## Loading required package: xml2

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Scrape the IMDB website to create a dataframe of information from 2016 top 100 movies

#IMDB search page to scrape: feature films released in 2016, 100 results per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

#Download the page and parse it into an HTML document
webpage <- url %>% read_html()
# save_url(webpage, filename="webpage.html")

Load various elements and clean data using gsub.

Use the `length()` function to confirm that each vector contains 100 elements; where data is missing, fill the gaps so that the vector still totals 100 elements.

#Select the nodes carrying each movie's rank (CSS class .text-primary)
rank_data_html <- webpage %>% html_nodes(".text-primary")

#Extract the text of the rank nodes
rank_data <- rank_data_html %>% html_text()

#Preview the raw rank strings
rank_data %>% head()

## [1] "1." "2." "3." "4." "5." "6."

#Data-Preprocessing: the strings end in "." ("1."); as.numeric() still parses them
rank_data <- rank_data %>% as.numeric()

#Preview the numeric ranks
rank_data %>% head()

## [1] 1 2 3 4 5 6

#Select the link inside each list item's header; its text is the movie title
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

#Extract the title text
title_data <- title_data_html %>% html_text()

#Preview the first few titles
title_data %>% head()

## [1] "Suicide Squad"           "Deadpool"               
## [3] "In a Valley of Violence" "Brimstone"              
## [5] "Train to Busan"          "Hush"

#Print the complete vector of titles
print(title_data)

##   [1] "Suicide Squad"                                   
##   [2] "Deadpool"                                        
##   [3] "In a Valley of Violence"                         
##   [4] "Brimstone"                                       
##   [5] "Train to Busan"                                  
##   [6] "Hush"                                            
##   [7] "Split"                                           
##   [8] "The Magnificent Seven"                           
##   [9] "Gods of Egypt"                                   
##  [10] "Hacksaw Ridge"                                   
##  [11] "Ghostbusters: Answer the Call"                   
##  [12] "Moana"                                           
##  [13] "Fantastic Beasts and Where to Find Them"         
##  [14] "Captain Fantastic"                               
##  [15] "Nocturnal Animals"                               
##  [16] "Batman v Superman: Dawn of Justice"              
##  [17] "Hidden Figures"                                  
##  [18] "The Conjuring 2"                                 
##  [19] "Me Before You"                                   
##  [20] "The Brothers Grimsby"                            
##  [21] "Batman: The Killing Joke"                        
##  [22] "Manchester by the Sea"                           
##  [23] "La La Land"                                      
##  [24] "Arrival"                                         
##  [25] "Rogue One: A Star Wars Story"                    
##  [26] "Pride and Prejudice and Zombies"                 
##  [27] "Don't Breathe"                                   
##  [28] "Hunt for the Wilderpeople"                       
##  [29] "The Autopsy of Jane Doe"                         
##  [30] "The Handmaiden"                                  
##  [31] "Miss Peregrine's Home for Peculiar Children"     
##  [32] "Captain America: Civil War"                      
##  [33] "Zootopia"                                        
##  [34] "Doctor Strange"                                  
##  [35] "The Invisible Guest"                             
##  [36] "Star Trek Beyond"                                
##  [37] "Moonlight"                                       
##  [38] "Your Name."                                      
##  [39] "The Girl on the Train"                           
##  [40] "Sing"                                            
##  [41] "Free State of Jones"                             
##  [42] "X-Men: Apocalypse"                               
##  [43] "Independence Day: Resurgence"                    
##  [44] "The Wailing"                                     
##  [45] "31"                                              
##  [46] "The Neon Demon"                                  
##  [47] "The Founder"                                     
##  [48] "Inferno"                                         
##  [49] "10 Cloverfield Lane"                             
##  [50] "Now You See Me 2"                                
##  [51] "Jason Bourne"                                    
##  [52] "Trolls"                                          
##  [53] "Lights Out"                                      
##  [54] "American Honey"                                  
##  [55] "The Love Witch"                                  
##  [56] "Passengers"                                      
##  [57] "Fences"                                          
##  [58] "13 Hours"                                        
##  [59] "The Boy"                                         
##  [60] "The Huntsman: Winter's War"                      
##  [61] "The Nice Guys"                                   
##  [62] "Ouija: Origin of Evil"                           
##  [63] "The Great Wall"                                  
##  [64] "Bastille Day"                                    
##  [65] "Allied"                                          
##  [66] "Lion"                                            
##  [67] "Midnight Special"                                
##  [68] "Gold"                                            
##  [69] "Hell or High Water"                              
##  [70] "War Dogs"                                        
##  [71] "Warcraft"                                        
##  [72] "Carrie Pilby"                                    
##  [73] "A Cure for Wellness"                             
##  [74] "The Accountant"                                  
##  [75] "Alice Through the Looking Glass"                 
##  [76] "Sausage Party"                                   
##  [77] "The Do-Over"                                     
##  [78] "Resident Evil: The Final Chapter"                
##  [79] "The Purge: Election Year"                        
##  [80] "Below Her Mouth"                                 
##  [81] "Dangal"                                          
##  [82] "Central Intelligence"                            
##  [83] "Underworld: Blood Wars"                          
##  [84] "The Legend of Tarzan"                            
##  [85] "A Silent Voice: The Movie"                       
##  [86] "Dirty Grandpa"                                   
##  [87] "The Lost City of Z"                              
##  [88] "Silence"                                         
##  [89] "The Jungle Book"                                 
##  [90] "Before I Wake"                                   
##  [91] "Deepwater Horizon"                               
##  [92] "Terrifier"                                       
##  [93] "Patriots Day"                                    
##  [94] "The Bad Batch"                                   
##  [95] "Assassin's Creed"                                
##  [96] "Teenage Mutant Ninja Turtles: Out of the Shadows"
##  [97] "Collateral Beauty"                               
##  [98] "Allegiant"                                       
##  [99] "Jack Reacher: Never Go Back"                     
## [100] "Nerve"

#Select the muted text element that directly follows each ratings bar (the plot summary)
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

#Extract the description text
description_data <- description_data_html %>% html_text()

#Preview the raw descriptions
description_data %>% head()

## [1] "\n    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "\n    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                              
## [3] "\n    A mysterious stranger and a random act of violence drag a town of misfits and nitwits into the bloody crosshairs of revenge."                                                       
## [4] "\n    From the moment the new reverend climbs the pulpit, Liz knows she and her family are in great danger."                                                                              
## [5] "\n    While a zombie virus breaks out in South Korea, passengers struggle to survive on the train from Seoul to Busan."                                                                   
## [6] "\n    A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."

#Data-Preprocessing: removing '\n' and the indentation that follows it.
#Each scraped description starts with "\n" plus four spaces; dropping only the "\n"
#left the spaces at the start of every description, so the result is also trimmed.
description_data <- trimws(gsub("\n", "", description_data))

#Let's have another look at the description data
head(description_data)

## [1] "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                              
## [3] "    A mysterious stranger and a random act of violence drag a town of misfits and nitwits into the bloody crosshairs of revenge."                                                       
## [4] "    From the moment the new reverend climbs the pulpit, Liz knows she and her family are in great danger."                                                                              
## [5] "    While a zombie virus breaks out in South Korea, passengers struggle to survive on the train from Seoul to Busan."                                                                   
## [6] "    A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."

#Checking how many descriptions were scraped (one per movie is expected)
length(description_data)

## [1] 100

#Select the runtime element inside each muted text line
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

#Extract the runtime text
runtime_data <- runtime_data_html %>% html_text()

#Preview the raw runtimes
runtime_data %>% head()

## [1] "123 min" "108 min" "104 min" "148 min" "118 min" "82 min"

#Data-Preprocessing: drop the " min" suffix and convert the remainder to a number
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

#Preview the numeric runtimes
runtime_data %>% head()

## [1] 123 108 104 148 118  82

#Count the runtimes scraped
runtime_data %>% length()

## [1] 100

#Select the genre element of each movie (CSS class .genre)
genre_data_html <- webpage %>% html_nodes(".genre")

#Extract the genre text
genre_data <- genre_data_html %>% html_text()

#Preview the raw genre strings
genre_data %>% head()

## [1] "\nAction, Adventure, Fantasy            "
## [2] "\nAction, Adventure, Comedy            " 
## [3] "\nAction, Western            "           
## [4] "\nDrama, Mystery, Thriller            "  
## [5] "\nAction, Horror, Thriller            "  
## [6] "\nHorror, Thriller            "

#Data-Preprocessing, in order:
#  1. strip the "\n"
#  2. strip every space
#  3. cut everything from the first comma on, keeping only the first listed genre
#  4. convert the remaining text to a factor
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()

#Preview the cleaned genres
genre_data %>% head()

## [1] Action Action Action Drama  Action Horror
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

#Count the genres scraped
genre_data %>% length()

## [1] 100

#Select the bold rating value inside each IMDB rating widget
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

#Extract the rating text
rating_data <- rating_data_html %>% html_text()

#Preview the raw ratings
rating_data %>% head()

## [1] "6.0" "8.0" "6.0" "7.1" "7.6" "6.6"

#Data-Preprocessing: convert the rating strings to numbers
rating_data <- rating_data %>% as.numeric()

#Preview the numeric ratings
rating_data %>% head()

## [1] 6.0 8.0 6.0 7.1 7.6 6.6

#Count the ratings scraped
rating_data %>% length()

## [1] 100

#Select the second child span of each votes line; it holds the vote count
votes_data_html <- webpage %>% html_nodes(".sort-num_votes-visible span:nth-child(2)")

#Extract the vote-count text
votes_data <- votes_data_html %>% html_text()

#Preview the raw vote counts
votes_data %>% head()

## [1] "591,456" "888,759" "15,560"  "35,696"  "158,157" "100,176"

#Data-Preprocessing: remove the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data))

#Preview the numeric vote counts
votes_data %>% head()

## [1] 591456 888759  15560  35696 158157 100176

#Count the vote totals scraped
votes_data %>% length()

## [1] 100

#Select the first link in the paragraph that follows the muted text; head() below shows
#that these are director names
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

#Extract the director names
directors_data <- directors_data_html %>% html_text()

#Preview the directors
directors_data %>% head()

## [1] "David Ayer"       "Tim Miller"       "Ti West"          "Martin Koolhoven"
## [5] "Sang-ho Yeon"     "Mike Flanagan"

#Data-Preprocessing: store the directors as a factor and count them
directors_data <- directors_data %>% as.factor()
directors_data %>% length()

## [1] 100

#Select the link that directly follows the .ghost separator; head() below shows
#that these are actor names, one per movie
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

#Extract the actor names
actors_data <- actors_data_html %>% html_text()

#Preview the actors
actors_data %>% head()

## [1] "Will Smith"         "Ryan Reynolds"      "Ethan Hawke"       
## [4] "Guy Pearce"         "Yoo Gong"           "John Gallagher Jr."

#Data-Preprocessing: store the actors as a factor and count them
actors_data <- actors_data %>% as.factor()
actors_data %>% length()

## [1] 100

Fill missing metascores with NAs using a for loop

#Select the metascore element of each movie (CSS class .metascore)
metascore_data_html <- webpage %>% html_nodes(".metascore")

#Extract the metascore text
metascore_data <- metascore_data_html %>% html_text()

#Preview the raw metascores
metascore_data %>% head()

## [1] "40        " "65        " "64        " "45        " "72        "
## [6] "67        "

#Data-Preprocessing: removing extra space in metascore
metascore_data <- gsub(" ", "", metascore_data)

#Lets check the length of metascore data
length(metascore_data)

## [1] 96

#Only 96 values were scraped, so four placeholders are inserted to bring the vector
#back to 100 elements and keep it aligned with the other columns.
#The positions are visited in ascending order, so every earlier insertion has already
#shifted the later elements by the time the next position is filled.
#A real missing value (NA_character_) is inserted rather than list("NA"): the list version
#silently turned the whole vector into a list and only became NA through coercion warnings.
#NOTE(review): the positions are hard-coded, presumably the ranks of the movies that had no
#metascore when the page was scraped -- re-check them whenever the page is scraped again.
for (i in c(21, 35, 81, 92)) {
  metascore_data <- append(metascore_data, NA_character_, after = i - 1)
}

#Data-Preprocessing: converting metascore to numerical (the inserted entries stay NA)
metascore_data <- as.numeric(metascore_data)

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

#Check the length again now that the missing entries have been inserted
metascore_data %>% length()

## [1] 100

#Summary statistics of the metascores, including the NA count
metascore_data %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   45.75   59.50   58.83   72.00   99.00       4

Fill missing gross data with placeholder values (-1 is used in place of NA) using a for loop

#Select the span that follows the muted label after the .ghost separator (the gross figure)
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")

#Extract the gross revenue text
gross_data <- gross_data_html %>% html_text()

#Preview the raw gross revenue strings
gross_data %>% head()

## [1] "$325.10M" "$363.07M" "$0.05M"   "$2.13M"   "$138.29M" "$93.43M"

#Data-Preprocessing: removing '$' and 'M' signs.
#Both signs are stripped by pattern instead of by character position:
#substring(gross_data, 2, 6) kept only five characters, which cut values of
#100 or more short (for example "363.07" became "363.0").
gross_data <- gsub("[$M]", "", gross_data)

#Let's check the length of gross data
length(gross_data)

## [1] 90

#Filling missing entries: only 90 values were scraped, so ten placeholders are inserted
#to bring the vector back to 100 elements and keep it aligned with the other columns.
#The positions are visited in ascending order, so every earlier insertion has already
#shifted the later elements by the time the next position is filled.
#NOTE(review): the placeholder is -1 rather than NA (kept from the original code).
#Any statistic computed on this column must exclude the negative values, otherwise
#the placeholders are counted as real earnings.
#NOTE(review): the positions are hard-coded, presumably the ranks of the movies that listed
#no gross figure when the page was scraped -- re-check them on every new scrape.
for (i in c(4, 6, 29, 35, 44, 77, 80, 85, 90, 92)) {
  gross_data <- append(gross_data, "-1", after = i - 1)
}

#Data-Preprocessing: converting gross to numerical
gross_data <- as.numeric(gross_data)

#Let's have another look at the length of gross data
length(gross_data)

## [1] 100

#Summary statistics (the minimum of -1 comes from the placeholders)
summary(gross_data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.000   6.615  44.285  78.455  98.267 532.100

Combine all the lists to form a data frame

#Assemble the cleaned vectors into a single data frame, one row per movie
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

#Inspect the column types and the first values of each column
str(movies_df)

## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Suicide Squad" "Deadpool" "In a Valley of Violence" "Brimstone" ...
##  $ Description         : chr  "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the"| __truncated__ "    A mysterious stranger and a random act of violence drag a town of misfits and nitwits into the bloody cross"| __truncated__ "    From the moment the new reverend climbs the pulpit, Liz knows she and her family are in great danger." ...
##  $ Runtime             : num  123 108 104 148 118 82 117 132 127 139 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 7 1 8 8 1 1 4 ...
##  $ Rating              : num  6 8 6 7.1 7.6 6.6 7.3 6.9 5.4 8.1 ...
##  $ Metascore           : num  40 65 64 45 72 67 62 54 25 71 ...
##  $ Votes               : num  591456 888759 15560 35696 158157 ...
##  $ Gross_Earning_in_Mil: num  325.1 363 0.05 -1 2.13 ...
##  $ Director            : Factor w/ 96 levels "Alex Proyas",..: 21 91 89 55 79 60 53 8 1 58 ...
##  $ Actor               : Factor w/ 89 levels "Aamir Khan","Adam Sandler",..: 88 71 30 35 89 41 37 20 12 5 ...

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

**Answer:** American Honey, Drama, 163 minutes

##### Add plotly to get more information on each bar segment

# p1 <- qplot(data = movies_df,Runtime,fill = Genre,bins = 30)
#Histogram of runtimes in 5-minute bins, one semi-transparent overlaid layer per genre
p1 <- ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(binwidth = 5, position = "identity", alpha = 0.5, color = "white") +
  scale_fill_discrete(name = "Genre") +
  labs(title = "Top 100 Movies of 2016 Runtime by Genre")
#Interactive version of the plot with every available field in the tooltip
ggplotly(p1, tooltip = "all")

#Row(s) with the longest runtime; the row name is kept as a "Name" column
filter(
  rownames_to_column(movies_df, var = "Name"),
  Runtime == max(Runtime)
)

##   Name Rank          Title
## 1   54   54 American Honey
##                                                                                                                                                                                                              Description
## 1     A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits.
##   Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil      Director
## 1     163 Drama      7        80 37528                 0.66 Andrea Arnold
##        Actor
## 1 Sasha Lane

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

**Answer:** A Silent Voice: The Movie (Animation)

#Scatter plot of rating against runtime; point size shows votes and colour shows genre.
#The tooltip text is built from the Title column of the data frame being plotted, so it
#stays matched to the rows even if the data frame is later filtered or reordered.
p2 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, col = Genre, text = paste("Movie Title:", Title)), alpha = 0.7) +
  labs(title = "Top 100 Movies of 2016 Runtime by Ratings")

## Warning: Ignoring unknown aesthetics: text

ggplotly(p2)

#Movie with the most votes among those running 130 to 160 minutes (both ends included).
#Runtime == c(130,160) did not test a range: the two-element vector is recycled down the
#column, so odd rows were compared with 130 and even rows with 160.
#NOTE(review): results recorded before this fix came from the recycled comparison and
#should be regenerated.
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(between(Runtime, 130, 160)) %>%
  filter(Votes == max(Votes))

##   Name Rank                     Title
## 1   85   85 A Silent Voice: The Movie
##                                                                                                                                                          Description
## 1     A young man is ostracized by his classmates after he bullies a deaf girl to the point where she moves away. Years later, he sets off on a path for redemption.
##   Runtime     Genre Rating Metascore Votes Gross_Earning_in_Mil     Director
## 1     130 Animation    8.1        78 43275                   -1 Naoko Yamada
##        Actor
## 1 Miyu Irino

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?

**Answer:** Action, $66.18 million

#Scatter plot of gross earnings against runtime; point size shows rating, colour shows genre.
#The y axis starts at -10 so that the -1 placeholders for missing gross stay visible.
p3 <- movies_df %>%
  ggplot(aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  labs(title = "Top 100 Movies of 2016 Runtime by Gross Earnings in Millions") +
  scale_y_continuous("Gross Earnings in Millions", limits = c(-10, 600))
ggplotly(p3)

#Genre with the highest average gross among movies running 100 to 120 minutes (inclusive).
#Two corrections to the filter:
#  * Runtime == c(100,120) did not test a range: the two-element vector is recycled down
#    the column, so odd rows were compared with 100 and even rows with 120.
#  * Rows whose gross is the -1 placeholder (or NA) are dropped, so that missing figures
#    are not averaged in as if they were real earnings.
#NOTE(review): results recorded before this fix should be regenerated.
movies_df %>%
  rownames_to_column(var = "Name") %>%
  filter(between(Runtime, 100, 120), Gross_Earning_in_Mil >= 0) %>%
  group_by(Genre) %>%
  summarize(averageGross = mean(Gross_Earning_in_Mil)) %>%
  filter(averageGross == max(averageGross))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 1 x 2
##   Genre  averageGross
##   <fct>         <dbl>
## 1 Action         66.2

```

DATA110_Hw9_WebScraping

Elizabeth Click

11/1/2020

Install necessary packages for this project

Scrape the IMDB website to create a dataframe of information from 2016 top 100 movies

Load various elements and clean data using gsub.

Use the `length()` function to confirm that each vector contains 100 elements; where data is missing, fill the gaps so that the vector still totals 100 elements.

Fill missing metascores with NAs using a for loop

Fill missing gross data with placeholder values (-1 is used in place of NA) using a for loop

Combine all the lists to form a data frame

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?