Web scraping IMDB with rvest

In this assignment, we will scrape IMDB with the rvest package for R and create a data frame with details of the IMDB Top 250 movies. The Top 250 is a very popular IMDB category, so it is useful to scrape the details of these movies and make them available for analysis. Once the dataset of 250 movies is available, we will propose one or two hypotheses based on our analysis.

The code below loads the packages needed to run the code in this document. Please make sure these packages are installed if you are running the code chunks in your own R environment.

library("rvest")

## Loading required package: xml2

library("XML")

## 
## Attaching package: 'XML'

## The following object is masked from 'package:rvest':
## 
##     xml

library("xml2")

The code chunk below creates the IMDB Top 250 dataset with six variables: movie name, movie cast, movie link, the year in which the movie was released, the votes received by the movie and the overall rating of the movie.

# IMDB Top 250 Movies
# Address of the chart page that is scraped below.
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"

page <- read_html(url)

# One <a> element per movie inside the title column.
movie.nodes <- html_nodes(page, ".titleColumn a")

# html_attr() is vectorised over the node set and yields NA for a node that
# lacks the attribute, instead of failing with "subscript out of bounds" the
# way `[[` on the html_attrs() list does.
movie.link <- html_attr(movie.nodes, "href")
# The href values are site-relative, so prefix the host to get full URLs.
movie.link <- paste0("http://www.imdb.com", movie.link)
# The link's title attribute is stored as the cast information.
movie.cast <- html_attr(movie.nodes, "title")
movie.name <- html_text(movie.nodes)

sec <- html_nodes(page, ".secondaryInfo")

# Strip the surrounding "(" and ")" from the node text, then convert the
# remaining characters to a number.
year <- as.numeric(gsub("[()]", "", html_text(sec)))

# Select every element carrying the class "imdbRating" on the chart page.
rating.nodes = html_nodes(page,'.imdbRating')
# Check one node: print the parsed tree of the 20th match to see where the
# rating and the vote count are stored before writing the extraction code.
xmlTreeParse(rating.nodes[[20]])

## $doc
## $file
## [1] "<buffer>"
## 
## $version
## [1] "1.0"
## 
## $children
## $children$td
## <td class="ratingColumn imdbRating">
##  <strong title="8.6 based on 925,517 user ratings">8.6</strong>
## </td>
## 
## 
## attr(,"class")
## [1] "XMLDocumentContent"
## 
## $dtd
## $external
## NULL
## 
## $internal
## NULL
## 
## attr(,"class")
## [1] "DTDList"
## 
## attr(,"class")
## [1] "XMLDocument"         "XMLAbstractDocument"

# Narrow the selection to the <strong> element inside each rating cell. Its
# text is the rating and its title attribute has the form
# "8.6 based on 925,517 user ratings".
rating.nodes <- html_nodes(page, ".imdbRating strong")

# html_attr() returns NA for a node without a title attribute rather than
# raising an error, and always returns a character vector.
rating.title <- html_attr(rating.nodes, "title")

# Vote count: drop everything up to "based on ", drop the " user ratings"
# suffix, remove the thousands separators, then convert to a number.
votes <- as.numeric(
  gsub(",", "",
       gsub(" user ratings", "",
            gsub(".*?based on ", "", rating.title)))
)

rating <- as.numeric(html_text(rating.nodes))

# One row per movie; the columns keep the names of the vectors built above.
top250 <- data.frame(movie.name, movie.cast, movie.link, year, votes, rating)

Task 1: Code to find all movies released between 1996 and 1998

# Keep the rows of top250 whose year lies in 1996-1998 (both ends included).
# which() turns the logical test into row positions, so rows with a missing
# year are dropped.
ConditionalIMDB <- top250[
  which(top250$year >= 1996 & top250$year <= 1998), ,
  drop = FALSE
]
# Reset the row names so the result is numbered from 1.
rownames(ConditionalIMDB) <- NULL

List of movies released between 1996 and 1998

# Show only the name and release year of the selected movies.
ConditionalIMDB[, c("movie.name", "year")]

##                             movie.name year
## 1                      La vita è bella 1997
## 2                  Saving Private Ryan 1998
## 3                   American History X 1998
## 4                        Mononoke-hime 1997
## 5                    L.A. Confidential 1997
## 6                    Good Will Hunting 1997
## 7                    Bacheha-Ye aseman 1997
## 8  Lock, Stock and Two Smoking Barrels 1998
## 9                     The Big Lebowski 1998
## 10                       Trainspotting 1996
## 11                               Fargo 1996
## 12                     The Truman Show 1998

Task 2&3: Code to collate information on Director, Stars, Tagline, Genre, Storyline, box office budget and gross of top 250 IMDB movies

# Visit every movie page linked from top250 and collect its details.
url1 <- as.character(top250$movie.link)

# Remove leading and trailing whitespace from each element of x.
trim.ws <- function(x) {
  gsub("^[ \t\n\v\r\f]+|[ \t\n\v\r\f]+$", "", x)
}

# Trim x and replace a result that is empty with the literal string "NA",
# the placeholder this script uses for missing text. Trimming first means
# leading whitespace is removed rather than being replaced by an "NA" prefix.
# A real NA input is passed through unchanged.
clean.text <- function(x) {
  x <- trim.ws(x)
  x[!is.na(x) & x == ""] <- "NA"
  x
}

# Scrape one movie page.
#   x: URL of the movie page.
# Returns a one-row data frame with the columns director, budget, tagline,
# stars, story, gross and genre.
scrape.movie <- function(x) {
  page <- read_html(x)

  # Gross: drop any parenthesised text, then keep the digits only.
  movie.gross <- html_node(page, "#titleDetails :nth-child(13)")
  gross <- gsub("\\(.*?\\)", "", as.character(html_text(movie.gross)))
  gross <- clean.text(gsub("[^0-9]", "", gross))

  # Genre: only the first matching node is kept.
  movie.genre <- html_nodes(page, ".subtext .itemprop")
  genre <- as.character(html_text(movie.genre)[1])

  # Director: only the first matching node is kept.
  movie.dir <- html_nodes(page, ".credit_summary_item:nth-child(2) .itemprop")
  director <- clean.text(as.character(html_text(movie.dir)[1]))

  # Tagline: remove newlines and outer whitespace so the "Taglines:" label
  # sits at the start of the string, then strip that label.
  movie.tag <- html_node(page, "#titleStoryLine .txt-block:nth-child(8)")
  tagline <- trim.ws(gsub("\n", "", as.character(html_text(movie.tag))))
  tagline <- clean.text(gsub("^Taglines:", "", tagline))

  # Budget: keep the digits only; format() avoids scientific notation and
  # turns a missing value into the string "NA".
  movie.budget <- html_node(page, "#titleDetails :nth-child(11)")
  budget <- as.numeric(gsub("[^0-9]", "", as.character(html_text(movie.budget))))
  budget <- format(budget, scientific = FALSE)

  # Storyline summary with newlines and outer whitespace removed.
  movie.story <- html_node(page, ".summary_text")
  story <- clean.text(gsub("\n", "", as.character(html_text(movie.story))))

  # Stars: collapse whitespace runs, tighten the comma separators, then strip
  # the "Stars: " label and the trailing "See full cast & crew" link text.
  movie.stars <- html_node(page, ".credit_summary_item:nth-child(4)")
  stars <- trim.ws(gsub("\n", "", as.character(html_text(movie.stars))))
  stars <- gsub("\\s+", " ", stars)
  stars <- gsub(", ", ",", stars)
  stars <- gsub("^Stars: ", "", stars)
  stars <- clean.text(gsub("\\|See full cast & crew »$", "", stars))

  data.frame(director, budget, tagline, stars, story, gross, genre)
}

# One request per movie; the result is a list of one-row data frames.
movie.db <- lapply(url1, scrape.movie)

# Stack the per-movie rows into a single data frame.
movie.tot <- do.call(rbind.data.frame, movie.db)

# Add the movie name and the page URL alongside the scraped details.
movie.tot <- cbind(movie.name, movie.tot, url1)

Task 4 & 4a: Code to visualize movie count vs. genre in the form of a table and a bar plot

# Count how many of the scraped movies carry each genre label, then show the
# resulting frequency table.
movieCount <- table(movie.tot[["genre"]])
print(movieCount)

## 
##     Crime    Action Biography Adventure   Western     Drama    Comedy 
##        39        35        21        29         3        67        25 
## Animation    Horror   Mystery Film-Noir    Sci-Fi 
##        20         3         5         2         1

# Draw the genre counts as a bar chart, one bar per genre.
barplot(
  movieCount,
  main = "Movie Genre Distribution",
  xlab = "Genre",
  ylab = "Movie Count"
)

The bar plot shows that Drama movies are the most numerous, followed by movies in the Crime and Action categories. Drama movies are usually watched across different age groups and segments of the population, and hence fetch more votes, which helps them feature in the Top 250 category.

One more interesting explanation could be that popular awards like the Academy Awards are generally given to drama movies, which could influence users to vote in favor of these movies. For instance, the Sci-Fi genre features only one movie in the Top 250, and we know how rarely sci-fi movies win Academy Awards. Supplementing this analysis with an analysis of Academy Award winning movies would probably make the picture clearer. A good hypothesis here would be that an Academy Award helps a movie feature in the IMDB Top 250.

Genres like Animation have been picking up in recent times, and it would be interesting to check whether Animation overtakes any of the other genres.

Code to print number of movies of each director

# Number of movies per director. summary() only tabulates counts for a
# factor; on a plain character column it reports Length/Class/Mode instead.
# as.factor() is a no-op when the column is already a factor and converts it
# when data.frame() kept the strings as character (the default since R 4.0).
dr <- summary(as.factor(movie.tot$director))
dr

##                Christopher Nolan                 Steven Spielberg 
##                                7                                7 
##                  Martin Scorsese                 Alfred Hitchcock 
##                                7                                7 
##                  Stanley Kubrick                   Hayao Miyazaki 
##                                7                                6 
##                Quentin Tarantino                     Sergio Leone 
##                                5                                5 
##                   Akira Kurosawa                  Charles Chaplin 
##                                5                                5 
##                     Billy Wilder                     Ridley Scott 
##                                5                                4 
##                   Ingmar Bergman             Francis Ford Coppola 
##                                4                                3 
##                     Sidney Lumet                    Peter Jackson 
##                                3                                3 
##                    David Fincher                      Frank Capra 
##                                3                                3 
##                    James Cameron                      Pete Docter 
##                                3                                3 
##                   Clint Eastwood                    William Wyler 
##                                3                                3 
##                   Frank Darabont                  Robert Zemeckis 
##                                2                                2 
##                     Milos Forman                   Roman Polanski 
##                                2                                2 
##                   Andrew Stanton                     Orson Welles 
##                                2                                2 
##                       Fritz Lang                       David Lean 
##                                2                                2 
##                  George Roy Hill                      Guy Ritchie 
##                                2                                2 
##                    Terry Gilliam                      John Huston 
##                                2                                2 
##                       Ron Howard                 Denis Villeneuve 
##                                2                                2 
##                        Joel Coen                   Victor Fleming 
##                                2                                2 
##                 Federico Fellini            Henri-Georges Clouzot 
##                                2                                2 
##                       Rob Reiner            Alejandro G. Iñárritu 
##                                2                                2 
##                   Irvin Kershner                   Lana Wachowski 
##                                1                                1 
##                     George Lucas               Fernando Meirelles 
##                                1                                1 
##                   Jonathan Demme                     Bryan Singer 
##                                1                                1 
##                  Roberto Benigni                       Luc Besson 
##                                1                                1 
##                        Tony Kaye                   Michael Curtiz 
##                                1                                1 
##                  Olivier Nakache                  Damien Chazelle 
##                                1                                1 
##                     Roger Allers Florian Henckel von Donnersmarck 
##                                1                                1 
##               Giuseppe Tornatore                    Isao Takahata 
##                                1                                1 
##                       Sam Mendes                   Chan-wook Park 
##                                1                                1 
##                Wolfgang Petersen                 Richard Marquand 
##                                1                                1 
##                       Mel Gibson               Jean-Pierre Jeunet 
##                                1                                1 
##                 Darren Aronofsky                  Robert Mulligan 
##                                1                                1 
##                      Lee Unkrich                    Michel Gondry 
##                                1                                1 
##                 Vittorio De Sica                    John Lasseter 
##                                1                                1 
##                    Stanley Donen                    Curtis Hanson 
##                                1                                1 
##                   Brian De Palma                  Rajkumar Hirani 
##                                1                                1 
##                   Asghar Farhadi                Thomas Vinterberg 
##                                1                                1 
##                     Gus Van Sant             Joseph L. Mankiewicz 
##                                1                                1 
##                       Carol Reed              Oliver Hirschbiegel 
##                                1                                1 
##                     Majid Majidi                   John McTiernan 
##                                1                                1 
##                     John Sturges                     Michael Mann 
##                                1                                1 
##               Guillermo del Toro                 Lenny Abrahamson 
##                                1                                1 
##                       Elia Kazan             Juan José Campanella 
##                                1                                1 
##              Carl Theodor Dreyer                   Stanley Kramer 
##                                1                                1 
##                   Clyde Bruckman                      David Lynch 
##                                1                                1 
##                      F.W. Murnau                   Gavin O'Connor 
##                                1                                1 
##                   James McTeigue                     Yasujirô Ozu 
##                                1                                1 
##                   Michael Cimino                      Danny Boyle 
##                                1                                1 
##                      J.J. Abrams                          (Other) 
##                                1                               52

No surprises here - movies by popular directors like Christopher Nolan, Steven Spielberg and Martin Scorsese feature most often in the IMDB Top 250 category.

Code to analyze the budget of movies in different genres. We have extracted the average budget of the movies belonging to each genre. There are several movies whose budget is NA. We eliminated them and then converted the column values to integers; a few values were also turned into NA by that conversion, and those rows were dropped as well.

# Keep only the two columns needed for the per-genre budget comparison.
subsetBudgetData <- subset(movie.tot, select = c("genre", "budget"))

# Drop the movies whose budget is the placeholder string "NA".
subsetBudgetData1 <- subset(subsetBudgetData, budget != "NA")

# Convert the budget text to numbers. as.numeric() is used instead of
# as.integer() because a budget larger than .Machine$integer.max cannot be
# stored as an integer and would be turned into NA with a coercion warning.
subsetBudgetData1$budget <- as.numeric(as.character(subsetBudgetData1$budget))

## Warning: NAs introduced by coercion to integer range

# Discard the rows whose budget is missing after the numeric conversion.
subsetBudgetData1 <- subsetBudgetData1[
  !is.na(subsetBudgetData1$budget), ,
  drop = FALSE
]
# Average budget per genre; the grouping column is reported as Group.1.
aggregate(
  subsetBudgetData1["budget"],
  by = list(subsetBudgetData1$genre),
  FUN = mean
)

##      Group.1    budget
## 1      Crime  67137610
## 2     Action  96976286
## 3  Biography  37186534
## 4  Adventure  53664335
## 5    Western   6100000
## 6      Drama  94435243
## 7     Comedy 156135641
## 8  Animation 108840000
## 9     Horror  21500000
## 10   Mystery  44500000
## 11    Sci-Fi  28000000

Top 3 categories in terms of average budget are Comedy, Animation and Action. While it can be understood that animation movies have bigger budgets because of investment in graphical technologies and action movies have bigger budgets because of outdoor shooting, it is surprising to see comedy genre in top 3.