Web Scraping Lab Assignment
Using the Beginner’s Guide on Web Scraping in R (using rvest)
Step 1. Load the rvest package and read in the web page to be scraped.
library('rvest')
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
# IMDb title search: feature films with a 2016 release date, 100 per page.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Fetch and parse the page once; the later steps all query this document.
webpage <- url %>% read_html()
Step 2. Use CSS Selectors to scrape the rankings section
# Select the nodes carrying the .text-primary class (the ranking numbers).
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Pull the text out of each matched node.
rank_data <- rank_data_html %>% html_text()
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
Step 4. Scrape the title section and convert title data to text.
# Titles are the links inside each listing's header.
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()
head(title_data)
## [1] "Suicide Squad" "Moonlight" "Rogue One" "The Handmaiden"
## [5] "Split" "La La Land"
Step 5. Scrape description data field and convert to text.
# The description is the .text-muted element right after the ratings bar.
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
# Extract the text and drop the embedded newline characters.
description_data <- description_data_html %>%
  html_text() %>%
  gsub("\n", "", .)
head(description_data)
## [1] " A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] " A young African-American man grapples with his identity and sexuality while experiencing the everyday struggles of childhood, adolescence, and burgeoning adulthood."
## [3] " The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the Death Star plans."
## [4] " A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."
## [5] " Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."
## [6] " While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future."
Step 6. Scrape runtime section and convert to text.
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Extract the text, strip the " min" unit and convert to a number.
runtime_data <- runtime_data_html %>%
  html_text() %>%
  gsub(" min", "", .) %>%
  as.numeric()
head(runtime_data)
## [1] 123 111 133 145 117 128
Step 7. Scrape genre section and convert to text. Select only the first genre of each movie.
genre_data_html <- webpage %>% html_nodes(".genre")
# Clean-up pipeline:
#   1. extract the text
#   2. remove newlines
#   3. remove spaces
#   4. cut everything from the first comma on, leaving only the first genre
#   5. store as a factor
genre_data <- genre_data_html %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()
head(genre_data)
## [1] Action Drama Action Drama Horror Comedy
## 9 Levels: Action Adventure Animation Biography Comedy Crime ... Mystery
Step 8. Scrape IMDB rating section and convert to text. Convert ratings to numerical.
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Extract the text and convert it straight to a number.
rating_data <- rating_data_html %>%
  html_text() %>%
  as.numeric()
head(rating_data)
## [1] 6.0 7.4 7.8 8.1 7.3 8.0
Step 9. Scrape the votes section and convert to text.
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Extract the text, drop the thousands separators and convert to a number.
votes_data <- votes_data_html %>%
  html_text() %>%
  gsub(",", "", .) %>%
  as.numeric()
head(votes_data)
## [1] 551853 237211 492326 85398 378250 451278
Step 10. Scrape the directors section and convert to text.
# Take the first link in the paragraph that follows a .text-muted element.
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Extract the names and store them as a factor.
directors_data <- directors_data_html %>%
  html_text() %>%
  as.factor()
head(directors_data)
## [1] David Ayer Barry Jenkins Gareth Edwards
## [4] Chan-wook Park M. Night Shyamalan Damien Chazelle
## 99 Levels: Adam Wingard Alex Proyas ... Zack Snyder
Step 11. Scrape the actors section and convert to text.
# Take the link immediately after the .ghost separator in each listing.
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
# Extract the names and store them as a factor.
actors_data <- actors_data_html %>%
  html_text() %>%
  as.factor()
head(actors_data)
## [1] Will Smith Mahershala Ali Felicity Jones Min-hee Kim
## [5] James McAvoy Ryan Gosling
## 90 Levels: Aamir Khan Alexander Skarsgård Amy Adams ... Zoey Deutch
Step 13. Scrape the gross revenue section and convert.
# Select the <span> that directly follows a .text-muted element which is
# itself a later sibling of a .ghost element.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)
# Remove the "$" and "M" signs but KEEP the decimal point: stripping every
# non-digit would turn a value such as "$325.10M" into "32510", i.e. 100x the
# real figure in millions.
gross_data <- gsub("[^0-9.]", "", gross_data)
#gross_data <- substring(gross_data,2,6)
head(gross_data)
## [1] "32510" "2785" "53218" "201" "13829" "15110"
# Count the scraped gross values. The page was requested with count=100, so
# fewer than 100 values here means some listings have no gross figure.
length(gross_data)
## [1] 90
Fill in missing entries with NA.
# Positions, in the final 100-entry ordering, of the listings that have no
# gross figure. They are processed in ascending order so each insertion lands
# at its final position.
missing_gross_idx <- c(29, 33, 40, 41, 43, 72, 74, 75, 76, 100)
for (i in missing_gross_idx) {
  # append(after = i - 1) places the new element at position i. Unlike slicing
  # with i:length(gross_data), it is safe when i is one past the current end
  # (where the range would run backwards and duplicate the last value).
  # NA_character_ keeps gross_data an atomic character vector holding a real
  # missing value rather than a list holding the string "NA".
  gross_data <- append(gross_data, NA_character_, after = i - 1)
}
# Print the padded values, flattened to a plain vector, for a visual check.
# The result is not assigned, so gross_data itself is left unchanged here.
unlist(gross_data)
## [1] "32510" "2785" "53218" "201" "13829" "15110" "6721" "36307"
## [9] "33036" "24876" "10055" "23404" "378" "133" "8626" "9343"
## [17] "5625" "5512" "4302" "15544" "34127" "078" "10001" "40808"
## [25] "502" "213" "12834" "1091" "NA" "23264" "5870" "1064"
## [33] "NA" "710" "8724" "3626" "8922" "7190" "15371" "NA"
## [41] "NA" "10247" "NA" "27040" "6618" "16243" "1263" "5174"
## [49] "6508" "11326" "588" "302" "023" "421" "4737" "15885"
## [57] "7540" "12664" "3434" "12507" "4770" "3115" "6268" "12744"
## [65] "10314" "1239" "520" "16961" "858" "2686" "018" "NA"
## [73] "9769" "NA" "NA" "NA" "36400" "4839" "5285" "2078"
## [81] "4554" "2683" "172" "36838" "1426" "3856" "6143" "6032"
## [89] "3098" "004" "066" "3554" "1427" "6727" "4601" "7921"
## [97] "5465" "4010" "1110" "NA" "1110"
# Flatten to a plain character vector (this also discards any NULL entries)
# and keep exactly one value per listing; the page was requested with
# count=100.
gross_data <- head(unlist(gross_data), 100)
# Turn literal "NA" placeholder strings into real missing values so the
# numeric conversion below does not have to coerce them (and warn).
gross_data[gross_data %in% "NA"] <- NA_character_
# Convert to numeric.
gross_data <- as.numeric(gross_data)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Sanity check: after padding there should be one gross value per listing.
length(gross_data)
## [1] 100
# Five-number summary of the gross values; the NA's column counts the
# listings for which no gross figure was filled in.
summary(gross_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4 1426 5568 9289 12212 53218 10
Analyzing scraped data from the web.
Step 1. Pull in packages. Begin to plot the data.
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 3.5.3
library('tidyverse')
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.3
## v tidyr 1.0.0 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.3 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
# Histogram of runtimes (30 bins), with bars filled by genre. Written with
# ggplot() + geom_histogram() instead of the deprecated qplot() shortcut, and
# to match the other plots in this report.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

Question #1: Based on the above data, which movie from which Genre had the longest runtime?
Step 2. Sort, filter, arrange - to get at the movie with the longest runtime.
# Order the movies from longest to shortest runtime. desc() takes a single
# vector, so only Runtime goes inside it; passing a second column to the same
# call is an "unused argument" error in current dplyr. Rows with equal
# runtimes keep their original relative order.
movies_ordered <- movies_df %>%
  arrange(desc(Runtime))
head(movies_ordered)
## Rank Title
## 1 91 American Honey
## 2 34 Silence
## 3 66 Dangal
## 4 100 The Wailing
## 5 9 Batman v Superman: Dawn of Justice
## 6 24 Captain America: Civil War
## Description
## 1 A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits.
## 2 In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism.
## 3 Former wrestler Mahavir Singh Phogat and his two wrestler daughters struggle towards glory at the Commonwealth Games in the face of societal oppression.
## 4 Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 5 Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.
## 6 Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil
## 1 163 Adventure 7.0 79 33006 66
## 2 161 Drama 7.1 79 88259 710
## 3 161 Action 8.4 NA 130707 1239
## 4 156 Horror 7.4 81 38128 NA
## 5 151 Action 6.5 44 588451 33036
## 6 147 Action 7.8 75 606316 40808
## Director Actor
## 1 Andrea Arnold Sasha Lane
## 2 Martin Scorsese Andrew Garfield
## 3 Nitesh Tiwari Aamir Khan
## 4 Hong-jin Na Jun Kunimura
## 5 Zack Snyder Ben Affleck
## 6 Anthony Russo Chris Evans
Answer #1: The movie “American Honey” had the longest runtime (163 minutes) and is in the Adventure genre.
Step 3. Make more plots.
# Rating against runtime; point size encodes the vote count and colour
# encodes the genre.
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))

Question #2: Based on the above data, among movies with a runtime of 130-160 minutes, which genre has the most votes?
Step 4. Pull the group of movies by the Runtime of interest. Arrange/sort by votes.
# Earlier attempts at selecting the runtime window, kept for reference:
#   movies_df %>% filter(Runtime > 129 | Runtime < 161)
#     -> `|` is true for every runtime; both bounds must hold, so use `&`.
#   subset(movies_df, Runtime = 130:160)
#     -> `=` passes a named argument; it is not a logical condition.
#   filter(movies_df(Runtime %in% c(130:160)))
#     -> misplaced parenthesis: this calls movies_df as if it were a function.
# I had a hard time selecting the movies with the right runtime.

# Keep the movies running 130 to 160 minutes (both ends included), most-voted
# first. Conditions separated by a comma in filter() must all be true.
runtime_sub <- movies_df %>%
  filter(Runtime >= 130, Runtime <= 160) %>%
  arrange(desc(Votes))
view(runtime_sub)
Answer #2: In the Runtime subgroup of 130-160 minutes, the Action Genre received the most votes.
Step 5. Make more plots.
# Gross earnings against runtime; point size encodes the rating and colour
# encodes the genre.
ggplot(data = movies_df, mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Rating, colour = Genre))
## Warning: Removed 10 rows containing missing values (geom_point).
