Data 110 - Unit 9 Webscraping

Read IMDB webpage for ranked movies

pacman::p_load(rvest, ggplot2, tidyverse, knitr)               #Loading packages

# Target page: IMDb search results for 100 feature films released in 2016
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download the page and parse its HTML
webpage <- url %>% read_html()

Select movie rankings and convert to numeric

# Select the rank elements (CSS class .text-primary) from the parsed page
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Extract the text of each rank element and preview it
rank_data <- rank_data_html %>% html_text()
head(rank_data)

## [1] "1." "2." "3." "4." "5." "6."

# The ranks arrive as strings such as "1."; turn them into numbers
rank_data <- rank_data %>% as.numeric()

head(rank_data)

## [1] 1 2 3 4 5 6

length(rank_data)

## [1] 100

Scrape movie titles

# Select the links inside each list item's header
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Keep only the link text, i.e. the movie titles
title_data <- title_data_html %>% html_text()
head(title_data)

## [1] "Batman v Superman: Dawn of Justice" "Suicide Squad"                     
## [3] "Captain America: Civil War"         "Captain Fantastic"                 
## [5] "Deadpool"                           "Nocturnal Animals"

length(title_data)

## [1] 100

Scrape and clean descriptions

# Select the .text-muted element that directly follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Extract the description text
description_data <- description_data_html %>% html_text()

# Check how many descriptions were found, then preview them
length(description_data)

## [1] 100

head(description_data)

## [1] "\n    Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                                                   
## [2] "\n    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                                             
## [3] "\n    Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man."                                                                                                                                              
## [4] "\n    In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [5] "\n    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                                           
## [6] "\n    A wealthy art gallery owner is haunted by her ex-husband's novel, a violent thriller she interprets as a symbolic revenge tale."

# Clean the descriptions: drop the newline characters, then trim the
# surrounding whitespace. Removing "\n" alone leaves four leading spaces on
# every description, which would otherwise be carried into the final table.
# NOTE: output captured before this change still shows those leading spaces.
description_data <- trimws(gsub("\n", "", description_data))

head(description_data)                                   # another look at the description data

## [1] "    Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."                                                                                   
## [2] "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                                             
## [3] "    Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man."                                                                                                                                              
## [4] "    In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical and intellectual education is forced to leave his paradise and enter the world, challenging his idea of what it means to be a parent."
## [5] "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                                           
## [6] "    A wealthy art gallery owner is haunted by her ex-husband's novel, a violent thriller she interprets as a symbolic revenge tale."

Scrape and clean runtime

# Select the .runtime elements found inside .text-muted
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Extract the runtime text
runtime_data <- runtime_data_html %>% html_text()

length(runtime_data)

## [1] 100

head(runtime_data)

## [1] "152 min" "123 min" "147 min" "118 min" "108 min" "116 min"

# Strip the " min" suffix and convert what is left to numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

head(runtime_data)

## [1] 152 123 147 118 108 116

Scrape and clean genre

# Select the elements with class .genre and extract their text
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text()

length(genre_data)

## [1] 100

head(genre_data)

## [1] "\nAction, Adventure, Sci-Fi            " 
## [2] "\nAction, Adventure, Fantasy            "
## [3] "\nAction, Adventure, Sci-Fi            " 
## [4] "\nComedy, Drama            "             
## [5] "\nAction, Adventure, Comedy            " 
## [6] "\nDrama, Thriller            "

# Remove every newline and every space in a single pass
genre_data <- gsub("[\n ]", "", genre_data)

# Keep only the first listed genre by cutting from the first comma onward
genre_data <- sub(",.*", "", genre_data)

# Store the genre as a factor instead of plain text
genre_data <- as.factor(genre_data)

head(genre_data)

## [1] Action Action Action Comedy Action Drama 
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror

Scrape and clean rating

# Select the <strong> element inside each .ratings-imdb-rating and extract its text
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>% html_text()

length(rating_data)

## [1] 100

head(rating_data)

## [1] "6.4" "6.0" "7.8" "7.9" "8.0" "7.5"

# Ratings are scraped as strings; make them numeric
rating_data <- rating_data %>% as.numeric()
head(rating_data)

## [1] 6.4 6.0 7.8 7.9 8.0 7.5

Scrape and clean votes

# Select the span that is the second child within each votes bar and extract its text
votes_data_html <- webpage %>% html_nodes(".sort-num_votes-visible span:nth-child(2)")
votes_data <- votes_data_html %>% html_text()

length(votes_data)

## [1] 100

head(votes_data)

## [1] "641,742" "611,108" "675,299" "194,178" "913,147" "242,786"

# Drop the thousands separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data))

head(votes_data)

## [1] 641742 611108 675299 194178 913147 242786

Scrape and clean directors

# Select the first-child link in the paragraph that follows .text-muted
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the link text (the director names)
directors_data <- directors_data_html %>% html_text()

length(directors_data)

## [1] 100

head(directors_data)

## [1] "Zack Snyder"   "David Ayer"    "Anthony Russo" "Matt Ross"    
## [5] "Tim Miller"    "Tom Ford"

# Treat the director as a categorical variable
directors_data <- directors_data %>% as.factor()

Scrape actors data

# Select the link that immediately follows the .ghost element in each list item
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

# Extract the link text (the actor names)
actors_data <- actors_data_html %>% html_text()

length(actors_data)

## [1] 100

head(actors_data)

## [1] "Ben Affleck"     "Will Smith"      "Chris Evans"     "Viggo Mortensen"
## [5] "Ryan Reynolds"   "Amy Adams"

# Treat the actor as a categorical variable
actors_data <- actors_data %>% as.factor()

Scrape and clean metascore

This is the way it’s done in the tutorial.

# Tutorial approach: read the .metascore elements directly
metascore_data_html <- webpage %>% html_nodes(".metascore")
metascore_data <- metascore_data_html %>% html_text()

# Strip the spaces from each score
metascore_data <- gsub(" ", "", metascore_data)

# Number of scores found; fewer than the 100 movies scraped
length(metascore_data)

## [1] 98

metascore_data

##  [1] "44" "40" "75" "72" "65" "67" "72" "81" "81" "74" "62" "52" "70" "65" "94"
## [16] "78" "57" "71" "84" "82" "79" "57" "25" "96" "44" "67" "68" "51" "66" "99"
## [31] "59" "81" "72" "48" "41" "71" "69" "51" "48" "54" "65" "58" "66" "47" "32"
## [46] "46" "49" "88" "21" "32" "76" "51" "33" "36" "51" "78" "78" "34" "66" "79"
## [61] "42" "40" "60" "60" "66" "33" "28" "77" "42" "55" "45" "59" "42" "83" "47"
## [76] "26" "58" "36" "77" "67" "64" "68" "60" "77" "69" "70" "66" "55" "18" "74"
## [91] "62" "81" "52" "93" "64" "42" "23" "72"

Find missing metascores

Movies #14 and #57 have no Metascore. The Metascore is part of the ratings bar in the HTML, so scrape the ratings bar and extract the Metascore with a regular expression. Movies without a Metascore then appear as NA in the correct position.

# Scrape each movie's whole ratings bar as text
ratings_bar_data <- webpage %>%
  html_nodes(".ratings-bar") %>%
  html_text2()

head(ratings_bar_data)                                                 # look at the ratings bar

## [1] "6.4\nRate this\n 1 2 3 4 5 6 7 8 9 10 6.4/10 X \n44 Metascore"
## [2] "6.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 6/10 X \n40 Metascore"  
## [3] "7.8\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.8/10 X \n75 Metascore"
## [4] "7.9\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.9/10 X \n72 Metascore"
## [5] "8.0\nRate this\n 1 2 3 4 5 6 7 8 9 10 8/10 X \n65 Metascore"  
## [6] "7.5\nRate this\n 1 2 3 4 5 6 7 8 9 10 7.5/10 X \n67 Metascore"

# Pull the Metascore out of each ratings bar. The score is the run of digits
# immediately before " Metascore"; capturing it with (\\d+) accepts one-,
# two- and three-digit scores, whereas a fixed \\d{2} would give NA for a
# single-digit score and read 100 as "00". Bars with no Metascore give NA, so
# the result keeps one entry per ratings bar.
metascore_data <- str_match(ratings_bar_data, "(\\d+) Metascore")[, 2] %>%
  as.numeric()

length(metascore_data)

## [1] 100

metascore_data

##   [1] 44 40 75 72 65 67 72 81 81 74 62 52 70 NA 65 94 78 57 71 84 82 79 57 25 96
##  [26] 44 67 68 51 66 99 59 81 72 48 41 71 69 51 48 54 65 58 66 47 32 46 49 88 21
##  [51] 32 76 51 33 36 51 NA 78 78 34 66 79 42 40 60 60 66 33 28 77 42 55 45 59 42
##  [76] 83 47 26 58 36 77 67 64 68 60 77 69 70 66 55 18 74 62 81 52 93 64 42 23 72

summary(metascore_data)                                               # summary statistics

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   47.00   63.00   59.91   72.00   99.00       2

Scrape earnings

This is the way it’s done in the tutorial.

# Tutorial approach: select the gross-earnings span directly
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
gross_data <- gross_data_html %>% html_text()

# Delete the "M" suffix, then keep characters 2-6 (this skips the leading "$")
gross_data <- substring(gsub("M", "", gross_data), 2, 6)

length(gross_data)

## [1] 93

head(gross_data)

## [1] "330.3" "325.1" "408.0" "5.88"  "363.0" "10.64"

Find missing earnings

Movies #14, #58, #64, #75, #82, #90 and #94 are missing earnings. Earnings are part of the votes bar in the HTML, so scrape the votes bar and extract the earnings with a regular expression. Movies without earnings then appear as NA in the correct position.

# Scrape each movie's whole votes bar as text
votes_bar_data <- webpage %>%
  html_nodes(".sort-num_votes-visible") %>%
  html_text2()

head(votes_bar_data)                                                 # look at the votes bar data

## [1] "Votes: 641,742 | Gross: $330.36M" "Votes: 611,108 | Gross: $325.10M"
## [3] "Votes: 675,299 | Gross: $408.08M" "Votes: 194,178 | Gross: $5.88M"  
## [5] "Votes: 913,147 | Gross: $363.07M" "Votes: 242,786 | Gross: $10.64M"

# Extract the gross earnings (in millions) from each votes bar. The capture
# group takes the number between "$" and "M", so no positional trimming is
# needed. The previous substring(x, 2, 6) kept at most five characters and
# therefore truncated values such as "$330.36M" to 330.3.
# Bars without a gross figure give NA, keeping one entry per votes bar.
# NOTE: results rendered before this change were computed from the truncated
# values and can differ in the second decimal place.
gross_data <- str_match(votes_bar_data, "\\$([0-9.]+)M")[, 2] %>%
  as.numeric()

length(gross_data)

## [1] 100

gross_data

##   [1] 330.30 325.10 408.00   5.88 363.00  10.64 232.60 100.50 248.70 169.60
##  [11] 138.20 155.40  36.26     NA 532.10 151.10 341.20  87.24  67.21   2.01
##  [21]   0.23   5.02  43.03  31.15  47.70 126.60   1.91 158.80  56.25 234.00
##  [31]  27.85 270.40   5.20   2.13  75.40 100.00  89.22  51.74  86.26  52.85
##  [41]  93.43 102.40 162.40  55.48   8.11  47.37  65.08  10.38  26.86  35.59
##  [51] 103.10  72.08   1.33  34.92  54.65  46.01  12.39     NA   8.58  14.90
##  [61]  12.79   7.10  34.34     NA 113.20 128.30  97.69  66.18  62.68 364.00
##  [71]  45.54 153.70  10.91  55.12     NA   3.37  58.70  18.71  21.59  14.27
##  [81]  14.43     NA   3.44  61.43  40.10 486.30  31.89   3.02  15.43     NA
##  [91]  32.46 125.00   0.18     NA  12.63   1.48   4.21  35.82  30.35  30.08

summary(gross_data)                                                  # summary statistics, including the count of NAs

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.18   14.27   47.70   89.49  103.10  532.10       7

Combine data

# Assemble the scraped vectors into a single tibble, one column per vector

movies_df <- tibble(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Run_time = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

str(movies_df)      # inspect column types and sizes

## tibble [100 x 11] (S3: tbl_df/tbl/data.frame)
##  $ Rank                : num [1:100] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr [1:100] "Batman v Superman: Dawn of Justice" "Suicide Squad" "Captain America: Civil War" "Captain Fantastic" ...
##  $ Description         : chr [1:100] "    Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world "| __truncated__ "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ "    Political involvement in the Avengers' affairs causes a rift between Captain America and Iron Man." "    In the forests of the Pacific Northwest, a father devoted to raising his six kids with a rigorous physical "| __truncated__ ...
##  $ Run_time            : num [1:100] 152 123 147 118 108 116 115 116 107 127 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 5 1 7 1 7 3 4 ...
##  $ Rating              : num [1:100] 6.4 6 7.8 7.9 8 7.5 7.5 7.9 7.6 7.8 ...
##  $ Metascore           : num [1:100] 44 40 75 72 65 67 72 81 81 74 ...
##  $ Votes               : num [1:100] 641742 611108 675299 194178 913147 ...
##  $ Gross_Earning_in_Mil: num [1:100] 330.3 325.1 408 5.88 363 ...
##  $ Director            : Factor w/ 98 levels "Alex Proyas",..: 98 23 6 61 93 95 85 26 81 91 ...
##  $ Actor               : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 8 88 20 87 73 3 9 3 7 82 ...

Question 1

Based on the above data, which movie from which Genre had the longest runtime?

# Histogram of runtimes (30 bins), filled by genre. Written with ggplot() +
# geom_histogram() because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(movies_df, aes(x = Run_time, fill = Genre)) +
  geom_histogram(bins = 30)

Answer 1: The comedy “Toni Erdmann” with runtime = 162 minutes.

# Show the row(s) whose runtime equals the maximum runtime
kable(filter(movies_df, Run_time == max(Run_time)))

Rank	Title	Description	Run_time	Genre	Rating	Metascore	Votes	Gross_Earning_in_Mil	Director	Actor
96	Toni Erdmann	A practical joking father tries to reconnect with his hard working daughter by creating an outrageous alter ego and posing as her CEO’s life coach.	162	Comedy	7.4	93	44824	1.48	Maren Ade	Sandra Hüller

Question 2

Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

# Runtime vs. rating; point size encodes votes and colour encodes genre
ggplot(movies_df, aes(x = Run_time, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre))

Answer 2: Action has the most votes.

# Total votes per genre for movies running 130 to 160 minutes, largest first
movies_df %>%
  filter(between(Run_time, 130, 160)) %>%
  group_by(Genre) %>%
  summarise(Votes = sum(Votes)) %>%
  arrange(desc(Votes)) %>%
  kable(format = "html")

Genre	Votes
Action	2679258
Biography	665280
Drama	522131
Adventure	414994
Horror	281926
Animation	51425

Question 3

Based on the above data, across all genres, which genre has the highest average gross earnings for a Run_time of 100 to 120 minutes?

# Runtime vs. gross earnings; size = rating, colour = genre.
# na.rm = TRUE drops points with missing values without a warning.
ggplot(movies_df, aes(x = Run_time, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre), na.rm = TRUE)

Answer 3: Animation has the highest average gross earnings of $216.33 million.

# Mean gross earnings per genre for runtimes of 100 to 120 minutes,
# leaving out movies with no earnings figure; highest average first
movies_df %>%
  filter(
    Run_time >= 100,
    Run_time <= 120,
    !is.na(Gross_Earning_in_Mil)
  ) %>%
  group_by(Genre) %>%
  summarise(Avg_earnings = mean(Gross_Earning_in_Mil)) %>%
  arrange(desc(Avg_earnings)) %>%
  kable(format = "html", digits = 2)

Genre	Avg_earnings
Animation	216.33
Adventure	141.56
Crime	75.40
Horror	69.76
Action	66.83
Drama	52.33
Biography	35.85
Comedy	29.41