Web Scraping Lab Assignment

Step 1. Pull in packages. (rvest) Pull in the website to be scraped.

library('rvest')
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'
webpage <- read_html(url)

Step 2. Use CSS Selectors to scrape the rankings section

rank_data_html <- html_nodes(webpage,'.text-primary')
#convert the ranking data to text.
rank_data <- html_text(rank_data_html)
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."

Step 3. Convert data to numerical format.

rank_data <- as.numeric(rank_data)
head(rank_data)
## [1] 1 2 3 4 5 6

Step 4. Scrape the title section and convert title data to text.

title_data_html <- html_nodes(webpage,'.lister-item-header a')
title_data <- html_text(title_data_html)
head(title_data)
## [1] "Suicide Squad"  "Moonlight"      "Rogue One"      "The Handmaiden"
## [5] "Split"          "La La Land"

Step 5. Scrape description data field and convert to text.

description_data_html <- html_nodes(webpage,'.ratings-bar+ .text-muted')
description_data <- html_text(description_data_html)
#remove the '\n'
description_data <- gsub("\n","",description_data)
head(description_data)
## [1] "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [2] "    A young African-American man grapples with his identity and sexuality while experiencing the everyday struggles of childhood, adolescence, and burgeoning adulthood."               
## [3] "    The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the Death Star plans."                                                                      
## [4] "    A woman is hired as a handmaiden to a Japanese heiress, but secretly she is involved in a plot to defraud her."                                                                     
## [5] "    Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."                      
## [6] "    While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future."

Step 6. Scrape runtime section and convert to text.

runtime_data_html <- html_nodes(webpage,'.text-muted .runtime')
runtime_data <- html_text(runtime_data_html)
#remove 'mins' and convert to numerical
runtime_data <- gsub(" min","",runtime_data)
runtime_data <- as.numeric(runtime_data)
head(runtime_data)
## [1] 123 111 133 145 117 128

Step 7. Scrape genre section and convert to text. Select only the first genre of each movie.

genre_data_html <- html_nodes(webpage,'.genre')
genre_data <- html_text(genre_data_html)
#remove the "\n" and excess spaces
genre_data <-  gsub("\n","",genre_data)
genre_data <-  gsub(" ","",genre_data)
#select only the first genre of each movie
genre_data <- gsub(",.*","",genre_data)
genre_data <- as.factor(genre_data)
head(genre_data)
## [1] Action Drama  Action Drama  Horror Comedy
## 9 Levels: Action Adventure Animation Biography Comedy Crime ... Mystery

Step 8. Scrape IMDB rating section and convert to text. Convert ratings to numerical.

rating_data_html <- html_nodes(webpage,'.ratings-imdb-rating strong')
rating_data <- html_text(rating_data_html)
rating_data <- as.numeric(rating_data)
head(rating_data)
## [1] 6.0 7.4 7.8 8.1 7.3 8.0

Step 9. Scrape the votes section and convert to text.

votes_data_html <- html_nodes(webpage,'.sort-num_votes-visible span:nth-child(2)')
votes_data <- html_text(votes_data_html)
#remove commas and convert to numerical
votes_data <- gsub(",","",votes_data)
votes_data <- as.numeric(votes_data)
head(votes_data)
## [1] 551853 237211 492326  85398 378250 451278

Step 10. Scrapte directors section and convert to text.

directors_data_html <- html_nodes(webpage,'.text-muted+ p a:nth-child(1)')
directors_data <- html_text(directors_data_html)
#convert directors into factors
directors_data <- as.factor(directors_data)
head(directors_data)
## [1] David Ayer         Barry Jenkins      Gareth Edwards    
## [4] Chan-wook Park     M. Night Shyamalan Damien Chazelle   
## 99 Levels: Adam Wingard Alex Proyas ... Zack Snyder

Step 11. Scrape the actors section and convert to text.

actors_data_html <- html_nodes(webpage,'.lister-item-content .ghost+ a')
actors_data <- html_text(actors_data_html)
#convert actors data into factors
actors_data <- as.factor(actors_data)
head(actors_data)
## [1] Will Smith     Mahershala Ali Felicity Jones Min-hee Kim   
## [5] James McAvoy   Ryan Gosling  
## 90 Levels: Aamir Khan Alexander Skarsgård Amy Adams ... Zoey Deutch

Step 12. Scrape the metascore section and convert.

metascore_data_html <- html_nodes(webpage,'.metascore')
metascore_data <- html_text(metascore_data_html)
#remove extra space
metascore_data <- gsub(" ","",metascore_data)
head(metascore_data)
## [1] "40" "99" "65" "84" "62" "93"

Enter the workaround for the movies with missing metascores. My list had 6 metascores missing.

for (i in c(13,29,40,43,66,75)){
  a <- metascore_data[1:(i-1)] 
b <- metascore_data[i:length(metascore_data)] 
metascore_data <- append(a,list("NA")) 
metascore_data <- append(metascore_data,b)}
#convert metascore to numerical
metascore_data <- as.numeric(metascore_data)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
length(metascore_data)
## [1] 100

Look at the summary statistics.

summary(metascore_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.00   59.50   58.68   71.75   99.00       6

Step 13. Scrape the gross revenue section and convert.

gross_data_html <- html_nodes(webpage,'.ghost~ .text-muted+ span')
gross_data <- html_text(gross_data_html)
#remove "$" and "M" signs.
gross_data <- gsub("[^0-9]*","",gross_data)
#gross_data <- substring(gross_data,2,6)
head(gross_data)
## [1] "32510" "2785"  "53218" "201"   "13829" "15110"
length(gross_data)
## [1] 90

Fill in missing entries with NA.

for (i in c(29,33,40,41,43,72,74,75,76,100)){
  a <- gross_data[1:(i-1)]
  b <- gross_data[i:length(gross_data)]
  gross_data <- append(a,list("NA"))
  gross_data <- append(gross_data,b)}
#convert to numerical
unlist(gross_data)
##   [1] "32510" "2785"  "53218" "201"   "13829" "15110" "6721"  "36307"
##   [9] "33036" "24876" "10055" "23404" "378"   "133"   "8626"  "9343" 
##  [17] "5625"  "5512"  "4302"  "15544" "34127" "078"   "10001" "40808"
##  [25] "502"   "213"   "12834" "1091"  "NA"    "23264" "5870"  "1064" 
##  [33] "NA"    "710"   "8724"  "3626"  "8922"  "7190"  "15371" "NA"   
##  [41] "NA"    "10247" "NA"    "27040" "6618"  "16243" "1263"  "5174" 
##  [49] "6508"  "11326" "588"   "302"   "023"   "421"   "4737"  "15885"
##  [57] "7540"  "12664" "3434"  "12507" "4770"  "3115"  "6268"  "12744"
##  [65] "10314" "1239"  "520"   "16961" "858"   "2686"  "018"   "NA"   
##  [73] "9769"  "NA"    "NA"    "NA"    "36400" "4839"  "5285"  "2078" 
##  [81] "4554"  "2683"  "172"   "36838" "1426"  "3856"  "6143"  "6032" 
##  [89] "3098"  "004"   "066"   "3554"  "1427"  "6727"  "4601"  "7921" 
##  [97] "5465"  "4010"  "1110"  "NA"    "1110"
gross_data <- gross_data[-c(101,102)]
gross_data <- as.numeric(gross_data)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
length(gross_data)
## [1] 100
summary(gross_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       4    1426    5568    9289   12212   53218      10

Step 14. Combine the lists to form a dataframe.

movies_df <- data.frame(Rank = rank_data, Title = title_data, Description = description_data, Runtime = runtime_data, Genre = genre_data, Rating = rating_data, Metascore = metascore_data, Votes = votes_data, Gross_Earning_in_Mil = gross_data, Director = directors_data, Actor = actors_data)
#structure of the dataframe
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : Factor w/ 100 levels "10 Cloverfield Lane",..: 64 51 58 75 62 42 33 24 10 50 ...
##  $ Description         : Factor w/ 100 levels "    19-year-old Billy Lynn is brought home for a victory tour after a harrowing Iraq battle. Through flashbacks"| __truncated__,..: 20 29 83 28 88 98 100 27 47 63 ...
##  $ Runtime             : num  123 111 133 145 117 128 139 108 151 107 ...
##  $ Genre               : Factor w/ 9 levels "Action","Adventure",..: 1 7 1 7 8 5 4 1 1 3 ...
##  $ Rating              : num  6 7.4 7.8 8.1 7.3 8 8.1 8 6.5 7.6 ...
##  $ Metascore           : num  40 99 65 84 62 93 71 65 44 81 ...
##  $ Votes               : num  551853 237211 492326 85398 378250 ...
##  $ Gross_Earning_in_Mil: num  32510 2785 53218 201 13829 ...
##  $ Director            : Factor w/ 99 levels "Adam Wingard",..: 27 14 36 19 59 22 63 94 99 82 ...
##  $ Actor               : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 87 53 30 65 36 70 4 71 9 8 ...

Success!

Analyzing scraped data from the web.

Step 1. Pull in packages. Begin to plot the data.

library('ggplot2')
## Warning: package 'ggplot2' was built under R version 3.5.3
library('tidyverse')
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.3.3
## v tidyr   1.0.0     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.1.3     v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
qplot(data = movies_df,Runtime,fill = Genre,bins = 30)

Question #1: Based on the above data, which movie from which Genre had the longest runtime?

Step 2. Sort, filter, arrange - to get at the movie with the longest runtime.

movies_ordered <- movies_df %>%
  arrange(desc(Runtime,Genre))
head(movies_ordered)
##   Rank                              Title
## 1   91                     American Honey
## 2   34                            Silence
## 3   66                             Dangal
## 4  100                        The Wailing
## 5    9 Batman v Superman: Dawn of Justice
## 6   24         Captain America: Civil War
##                                                                                                                                                                                                              Description
## 1     A teenage girl with nothing to lose joins a traveling magazine sales crew, and gets caught up in a whirlwind of hard partying, law bending and young love as she criss-crosses the Midwest with a band of misfits.
## 2                                      In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism.
## 3                                                               Former wrestler Mahavir Singh Phogat and his two wrestler daughters struggle towards glory at the Commonwealth Games in the face of societal oppression.
## 4                         Soon after a stranger arrives in a little village, a mysterious sickness starts spreading. A policeman, drawn into the incident, is forced to solve the mystery in order to save his daughter.
## 5                                                          Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.
## 6                                                                                                                     Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man.
##   Runtime     Genre Rating Metascore  Votes Gross_Earning_in_Mil
## 1     163 Adventure    7.0        79  33006                   66
## 2     161     Drama    7.1        79  88259                  710
## 3     161    Action    8.4        NA 130707                 1239
## 4     156    Horror    7.4        81  38128                   NA
## 5     151    Action    6.5        44 588451                33036
## 6     147    Action    7.8        75 606316                40808
##          Director           Actor
## 1   Andrea Arnold      Sasha Lane
## 2 Martin Scorsese Andrew Garfield
## 3   Nitesh Tiwari      Aamir Khan
## 4     Hong-jin Na    Jun Kunimura
## 5     Zack Snyder     Ben Affleck
## 6   Anthony Russo     Chris Evans

Answer #1: The movie “American Honey” had the highest runtime (163 minutes) and was in the Adventure Genre.

Step 3. Make more plots.

ggplot(movies_df,aes(x=Runtime,y=Rating))+
  geom_point(aes(size=Votes,col=Genre))

Question #2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

Step 4. Pull the group of movies by the Runtime of interest. Arrange/sort by votes.

#runtime_sub <- movies_df %>%
  #filter(Runtime > 129 | Runtime <161)
#runtime_sub
runtime_sub <- movies_df %>%
  filter(Runtime <= 160 & Runtime >= 130) %>%
  arrange(desc(Votes))
view(runtime_sub)
#runtime_sub <- subset(movies_df, Runtime = 130:160)
#runtime_sub

#runtime_sub <- filter(movies_df(Runtime %in% c(130:160)))
#runtime_sub

#I had a hard time selecting the movies with the right runtime.

Answer #2: In the Runtime subgroup of 130-160 minutes, the Action Genre received the most votes.

Step 5. Make more plots.

ggplot(movies_df,aes(x=Runtime,y=Gross_Earning_in_Mil))+
  geom_point(aes(size=Rating,col=Genre))
## Warning: Removed 10 rows containing missing values (geom_point).

Question #3: Based on the above data, across all genres, which genre has the highest average gross earnings in runtime 100 to 120?

Step 6. create a subset of the records of interest (runtime between 100-120.)

runtime_short <- movies_df %>%
  filter(Runtime <=120 & Runtime >= 100)
runtime_short_ordered <- runtime_short %>%
  group_by(Genre) %>%
  summarise(sum = sum(Gross_Earning_in_Mil, na.rm = TRUE))
runtime_short_ordered
## # A tibble: 8 x 2
##   Genre        sum
##   <fct>      <dbl>
## 1 Action    161435
## 2 Adventure  36920
## 3 Animation  86545
## 4 Biography   9476
## 5 Comedy     23255
## 6 Crime       8803
## 7 Drama      36738
## 8 Horror     14040

Answer #3: It looks like Action genre has the highest earnings across the genres.