Group Homework #2 - Webscraping with rvest

R Code to extract Data of Top 250 movies from IMDB

library(plyr)
# this code basically installed packages if not installed already and load the mentioned packages 
if (!require("pacman")) install.packages("pacman")

## Loading required package: pacman

pacman::p_load("rvest","XML","stringi")

# Set the working directory in which the CSV outputs are written.
# The path is machine-specific, so only switch to it when it actually exists;
# otherwise keep the current working directory instead of aborting the script.
project_dir <- "C:\\Users\\ADMIN\\Google Drive\\DC\\DC_GA_2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Directory '", project_dir, "' not found; staying in ", getwd())
}

# URL of the IMDB page that lists the Top 250 movies
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
# Download and parse the HTML of the chart page
page <- read_html(url)

# Every '.titleColumn a' anchor carries three pieces of information: the link
# to the movie page (href attribute), the director/cast summary (title
# attribute) and the movie name (anchor text). Extract each into its own
# vector. html_attr() yields NA for an anchor lacking the attribute instead of
# failing with a subscript error.
movie.nodes <- html_nodes(page, ".titleColumn a")
movie.link <- paste0("http://www.imdb.com", html_attr(movie.nodes, "href"))
movie.cast <- html_attr(movie.nodes, "title")
movie.name <- html_text(movie.nodes)

# The release year sits in '.secondaryInfo' as "(1994)"; strip the parentheses
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))

# The <strong> element inside '.imdbRating' shows the rating as its text and
# the vote count inside its title attribute ("... based on N user ratings")
rating.nodes <- html_nodes(page, ".imdbRating strong")
votes_text <- gsub(".*?based on ", "", html_attr(rating.nodes, "title"))
votes_text <- gsub(" user ratings", "", votes_text, fixed = TRUE)
votes <- as.numeric(gsub(",", "", votes_text, fixed = TRUE))

rating <- as.numeric(html_text(rating.nodes))

# Number of movies found; reused later to size the per-movie result vectors
rows <- length(movie.name)
if (rows == 0) {
  stop("No movies found on ", url, "; the page layout may have changed.",
       call. = FALSE)
}

# Build the data frame. A rank column (position in the chart) is added so the
# result can be cross-checked against the page.
rank <- seq_len(rows)
top250 <- data.frame(rank, movie.name, movie.cast, movie.link, year, votes, rating)
# Save as CSV file
write.csv(top250, "IMDB Top 250.csv", row.names = FALSE)

Snapshot of top 250 movies

##   rank               movie.name
## 1    1 The Shawshank Redemption
## 2    2            The Godfather
## 3    3   The Godfather: Part II
## 4    4          The Dark Knight
## 5    5             12 Angry Men
## 6    6         Schindler's List
##                                               movie.cast
## 1     Frank Darabont (dir.), Tim Robbins, Morgan Freeman
## 2  Francis Ford Coppola (dir.), Marlon Brando, Al Pacino
## 3 Francis Ford Coppola (dir.), Al Pacino, Robert De Niro
## 4 Christopher Nolan (dir.), Christian Bale, Heath Ledger
## 5          Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb
## 6    Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes
##                                                                                                                                                               movie.link
## 1 http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
## 2 http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
## 3 http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
## 4 http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
## 5 http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
## 6 http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
##   year   votes rating
## 1 1994 1718456    9.2
## 2 1972 1174559    9.2
## 3 1974  804922    9.0
## 4 2008 1704632    8.9
## 5 1957  457572    8.9
## 6 1993  880006    8.9

Task 1: Find all movies that were released between 1996 and 1998 (both years inclusive)

# top250$year holds the release year as text (a factor or a character vector,
# depending on the R version's stringsAsFactors default). Go through
# as.character() first: as.numeric() on a factor would return level codes.
top250$year <- as.numeric(as.character(top250$year))
# Keep the movies released in 1996, 1997 or 1998 (rows with NA year are dropped)
movies1996_1998 <- subset(top250, year >= 1996 & year <= 1998)
write.csv(movies1996_1998, "movies1996_1998.csv", row.names = FALSE)

Movies released during the 1996-1998 time period

	rank	movie.name	movie.cast	movie.link	year	votes	rating
26	26	La vita è bella	Roberto Benigni (dir.), Roberto Benigni, Nicoletta Braschi	http://www.imdb.com/title/tt0118799/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_26	1997	429117	8.6
29	29	Saving Private Ryan	Steven Spielberg (dir.), Tom Hanks, Matt Damon	http://www.imdb.com/title/tt0120815/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_29	1998	897416	8.5
31	31	American History X	Tony Kaye (dir.), Edward Norton, Edward Furlong	http://www.imdb.com/title/tt0120586/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_31	1998	795211	8.5
65	65	Mononoke-hime	Hayao Miyazaki (dir.), Yôji Matsuda, Yuriko Ishida	http://www.imdb.com/title/tt0119698/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_65	1997	226361	8.4
97	97	L.A. Confidential	Curtis Hanson (dir.), Kevin Spacey, Russell Crowe	http://www.imdb.com/title/tt0119488/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_97	1997	420345	8.3
107	107	Good Will Hunting	Gus Van Sant (dir.), Robin Williams, Matt Damon	http://www.imdb.com/title/tt0119217/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_107	1997	616586	8.2
118	118	Bacheha-Ye aseman	Majid Majidi (dir.), Mohammad Amir Naji, Amir Farrokh Hashemian	http://www.imdb.com/title/tt0118849/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_118	1997	29686	8.2
136	136	Lock, Stock and Two Smoking Barrels	Guy Ritchie (dir.), Jason Flemyng, Dexter Fletcher	http://www.imdb.com/title/tt0120735/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_136	1998	420939	8.2
151	151	The Big Lebowski	Joel Coen (dir.), Jeff Bridges, John Goodman	http://www.imdb.com/title/tt0118715/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_151	1998	546858	8.1
155	155	Trainspotting	Danny Boyle (dir.), Ewan McGregor, Ewen Bremner	http://www.imdb.com/title/tt0117951/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_155	1996	476883	8.1
157	157	Fargo	Joel Coen (dir.), William H. Macy, Frances McDormand	http://www.imdb.com/title/tt0116282/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_157	1996	461963	8.1
206	206	The Truman Show	Peter Weir (dir.), Jim Carrey, Ed Harris	http://www.imdb.com/title/tt0120382/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_206	1998	680829	8.0

Task 2: Scrape the following information: director, stars, taglines, genres, (partial) storyline, box office budget and box office gross.

# Task 2 scraper: visit every movie page listed in top250$movie.link and
# collect director, stars, tagline, genres, storyline, budget and gross into
# the vectors directors, movie.stars, taglines, generes, storyline,
# movie.budget and movie.gross (one element per movie, in chart order).

# Collapse every run of whitespace (spaces, tabs, newlines, non-breaking
# spaces) into a single space and drop leading/trailing whitespace.
squish_ws <- function(text) {
  trimws(gsub("[[:space:]\u00a0]+", " ", text))
}

# Return the part of `text` that follows its first ":" (the value behind a
# "Label:" prefix). Returns NA when `text` is NA or contains no colon.
after_label <- function(text) {
  colon <- regexpr(":", text, fixed = TRUE)[1]
  if (is.na(text) || colon == -1) {
    return(NA_character_)
  }
  substr(text, colon + 1, nchar(text))
}

# Cut `text` at its first blank line ("\n\n"); the budget/gross blocks carry
# the amount before that break. Text without a blank line is returned whole.
before_blank_line <- function(text) {
  brk <- regexpr("\n\n", text, fixed = TRUE)[1]
  if (is.na(brk) || brk == -1) {
    return(text)
  }
  substr(text, 1, brk - 1)
}

movie_links <- as.character(top250$movie.link)
n_movies <- length(movie_links)

# Character vectors holding the column data; entries stay NA for a movie
# whose page could not be downloaded or that lacks the field.
directors <- rep(NA_character_, n_movies)
movie.stars <- rep(NA_character_, n_movies)
taglines <- rep(NA_character_, n_movies)
storyline <- rep(NA_character_, n_movies)
generes <- rep(NA_character_, n_movies)
movie.budget <- rep(NA_character_, n_movies)
movie.gross <- rep(NA_character_, n_movies)

# Loop over all the movie links and extract the required data
for (i in seq_len(n_movies)) {
  # A single failed download must not discard the pages already scraped:
  # warn, leave this movie's fields as NA and continue with the next one.
  movie_page <- tryCatch(
    read_html(movie_links[i]),
    error = function(e) {
      warning("Could not read ", movie_links[i], ": ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(movie_page)) {
    next
  }

  # Tagline, budget and gross live in '.txt-block' elements that start with
  # the labels "Taglines:", "Budget:" and "Gross:". A field whose block is
  # absent is stored as an empty string.
  tagline <- ""
  budget <- ""
  gross <- ""
  for (block_text in html_text(html_nodes(movie_page, ".txt-block"))) {
    if (grepl("Taglines:", block_text, fixed = TRUE)) {
      # Warning: only the tagline shown on the movie page is extracted, not
      # the full list (that would need a read_html() on the taglines link).
      tagline <- squish_ws(after_label(block_text))
      # Drop the trailing "See more" link text that follows the tagline
      tagline <- sub("[[:space:]]*See more[^[:alnum:]]*$", "", tagline)
    } else if (grepl("Budget:", block_text, fixed = TRUE)) {
      budget <- squish_ws(before_blank_line(after_label(block_text)))
    } else if (grepl("Gross:", block_text, fixed = TRUE)) {
      gross <- squish_ws(before_blank_line(after_label(block_text)))
    }
  }
  taglines[i] <- tagline
  movie.gross[i] <- gross
  movie.budget[i] <- budget

  # '.canwrap' elements: the first one holds the storyline and the genres are
  # expected in the third one.
  canwrap <- squish_ws(html_text(html_nodes(movie_page, ".canwrap")))

  # Genres: prefer the element labelled "Genres:" (presumably the label IMDB
  # uses -- TODO confirm) and fall back to the third element otherwise.
  genre_text <- canwrap[grepl("Genres:", canwrap, fixed = TRUE)][1]
  if (is.na(genre_text)) {
    genre_text <- canwrap[3]
  }
  genre_value <- after_label(genre_text)
  if (!is.na(genre_value)) {
    # "Crime | Drama" -> "Crime, Drama"
    genre_parts <- strsplit(genre_value, "|", fixed = TRUE)[[1]]
    generes[i] <- paste(squish_ws(genre_parts), collapse = ", ")
  }

  # Storyline
  storyline[i] <- canwrap[1]

  # '.credit_summary_item' elements: the first one holds the director(s)
  casts <- squish_ws(html_text(html_nodes(movie_page, ".credit_summary_item")))
  directors[i] <- squish_ws(after_label(casts[1]))

  # Main star cast: the element labelled "Stars:" (third element as
  # fallback), without the "|See full cast & crew" link text that follows.
  stars_text <- casts[grepl("Stars:", casts, fixed = TRUE)][1]
  if (is.na(stars_text)) {
    stars_text <- casts[3]
  }
  movie.stars[i] <- squish_ws(sub("\\|.*$", "", after_label(stars_text)))
}

Task 3: create data frame from extracted information

# Combine the movie names with the details scraped per movie. Columns are
# named directly in the constructor, so no separate rename step is needed.
# The column order is significant: later steps refer to the same columns.
top250_movies_details <- data.frame(
  movie.name = top250$movie.name,
  directors = directors,
  taglines = taglines,
  movie.stars = movie.stars,
  generes = generes,
  storyline = storyline,
  movie.gross = movie.gross,
  movie.budget = movie.budget
)
# Write this information to a CSV file
write.csv(top250_movies_details, "top250_movies_details.csv", row.names = FALSE)

Display head of top 250 movies details

# Show the first six rows of the scraped details
head(top250_movies_details, n = 6L)

##                 movie.name              directors
## 1 The Shawshank Redemption        Frank Darabont 
## 2            The Godfather  Francis Ford Coppola 
## 3   The Godfather: Part II  Francis Ford Coppola 
## 4          The Dark Knight     Christopher Nolan 
## 5             12 Angry Men          Sidney Lumet 
## 6         Schindler's List      Steven Spielberg 
##                                                                         taglines
## 1                             Fear can hold you prisoner. Hope can set you free.
## 2                                  An offer you can't refuse. \n See more »\n \n
## 3                                                                               
## 4                                   I Believe In Harvey Dent. \n See more »\n \n
## 5 They have twelve scraps of paper... Twelve chances to kill! \n See more »\n \n
## 6             Whoever saves one life, saves the world entire. \n See more »\n \n
##                                   movie.stars
## 1     Tim Robbins, Morgan Freeman, Bob Gunton
## 2        Marlon Brando, Al Pacino, James Caan
## 3    Al Pacino, Robert De Niro, Robert Duvall
## 4 Christian Bale, Heath Ledger, Aaron Eckhart
## 5     Henry Fonda, Lee J. Cobb, Martin Balsam
## 6    Liam Neeson, Ralph Fiennes, Ben Kingsley
##                             generes
## 1                     Crime , Drama
## 2                     Crime , Drama
## 3                     Crime , Drama
## 4 Action , Crime , Drama , Thriller
## 5                     Crime , Drama
## 6       Biography , Drama , History
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          storyline
## 1                                                                                                                                                                                               Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red. Written byJ-S-Golden
## 2                                                                                                                                                                                                             When the aging head of a famous crime family decides to transfer his position to one of his subalterns, a series of unfortunate events start happening to the family, and a war begins between all the well-known families leading to insolence, deportation, murder and revenge, and ends with the favorable successor being finally chosen. Written byJ. S. Golden
## 3                                                                                                                                                                                                                                                                                The continuing saga of the Corleone crime family tells the story of a young Vito Corleone growing up in Sicily and in 1910s New York; and follows Michael Corleone in the 1950s as he attempts to expand the family business into Las Vegas, Hollywood and Cuba. Written byKeith Loh <loh@sfu.ca>
## 4 Set within a year after the events of Batman Begins, Batman, Lieutenant James Gordon, and new district attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City until a mysterious and sadistic criminal mastermind known only as the Joker appears in Gotham, creating a new wave of chaos. Batman's struggle against the Joker becomes deeply personal, forcing him to "confront everything he believes" and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent and Rachel Dawes. Written byLeon Lombardi
## 5                                         The defense and the prosecution have rested and the jury is filing into the jury room to decide if a young man is guilty or innocent of murdering his father. What begins as an open-and-shut case of murder soon becomes a detective story that presents a succession of clues creating doubt, and a mini-drama of each of the jurors' prejudices and preconceptions about the trial, the accused, and each other. Based on the play, all of the action takes place on the stage of the jury room. Written bypjk <PETESID@VNET.IBM.COM>
## 6                                                                                                                                    Oskar Schindler is a vainglorious and greedy German businessman who becomes an unlikely humanitarian amid the barbaric German Nazi reign when he feels compelled to turn his factory into a refuge for Jews. Based on the true story of Oskar Schindler who managed to save about 1100 Jews from being gassed at the Auschwitz concentration camp, it is a testament to the good in all of us. Written byHarald Mayr <marvin@bike.augusta.de>
##    movie.gross movie.budget
## 1  $28,341,469  $25,000,000
## 2 $134,821,952   $6,000,000
## 3  $57,300,000  $13,000,000
## 4 $533,316,061 $185,000,000
## 5                  $350,000
## 6  $96,067,179  $22,000,000

Task 4: Make a table of movie count versus genre.

# Split the comma-separated genre list of every movie into individual genres
splitgenre <- strsplit(as.character(top250_movies_details$generes), ",",
                       fixed = TRUE)
# Stack all genres into a single column
unlistgenre <- data.frame(genre = unlist(splitgenre))
# Remove surrounding spaces; as.character() makes the trim independent of
# whether data.frame() stored the column as a factor
genre <- stri_trim(as.character(unlistgenre$genre))
# Count the number of movies per genre
genresummary <- data.frame(table(genre))
# Save the output as CSV
write.csv(genresummary, "GenreOfTop250Movies.csv", row.names = FALSE)

Frequency Table and Bar graph of Movies counts vs Genre

genre	Freq
Action	36
Adventure	62
Animation	20
Biography	26
Comedy	42
Crime	56
Drama	176
Family	24
Fantasy	33
Film-Noir	7
History	17
Horror	5
Music	2
Musical	5
Mystery	36
Romance	24
Sci-Fi	31
Sport	7
Thriller	63
War	30
Western	10

Task 4a: Hypothesis testing

Hypothesis: We wanted to understand how different genres perform with respect to Gross, Budget and ROI (i.e. Gross/Budget). To simplify, we considered only the 166 movies from the top 250 that have both Budget and Gross available in USD. Also, due to lack of time, this analysis was done partly in R (data cleaning) and partly in Excel (summarization).

# Task 4a data preparation: expand the per-movie details into one row per
# (movie, genre) and one row per (movie, star), each carrying the movie's
# numeric gross and budget, and export both tables as CSV for summarization.
data <- top250_movies_details

# Keep only the digits of the scraped amounts (drops currency symbols and
# thousands separators); the cleaned text is stored in gross/budget.
# NOTE(review): the currency itself is discarded here, so any amount that is
# not in USD has to be filtered out before amounts are compared.
data$gross <- gsub("[^0-9]", "", as.character(data$movie.gross))
data$budget <- gsub("[^0-9]", "", as.character(data$movie.budget))

# Transform the gross and budget columns to numeric. The conversion must use
# the cleaned text: converting the raw "$1,234" strings (or a factor) gives
# NA or factor level codes. Movies without an amount become NA.
data$movie.gross <- as.numeric(data$gross)
data$movie.budget <- as.numeric(data$budget)

# Remove special characters from the genre list, keeping the comma separator
# and the hyphen so that names such as "Sci-Fi" and "Film-Noir" stay intact.
data$generes <- gsub("[^0-9A-Za-z,/' -]", "", data$generes)

# --- One row per (movie, genre) ---
# Split multiple genres of the same movie
splitgenre <- strsplit(as.character(data$generes), ",", fixed = TRUE)
genres_per_movie <- vapply(splitgenre, length, integer(1))
# Convert genres into a single column, repeating each movie's gross and
# budget once per genre
unlistgenre <- data.frame(
  genre = unlist(splitgenre),
  gross = rep(data$movie.gross, genres_per_movie),
  budget = rep(data$movie.budget, genres_per_movie)
)
# Remove surrounding spaces from the genre names
trimgenre <- data.frame(
  genre = stri_trim(as.character(unlistgenre$genre)),
  gross = unlistgenre$gross,
  budget = unlistgenre$budget
)
write.csv(trimgenre, "GenreGross.csv", row.names = FALSE) # Save output as CSV

# --- One row per (movie, star) ---
# Split multiple stars of the same movie
splitstars <- strsplit(as.character(data$movie.stars), ",", fixed = TRUE)
stars_per_movie <- vapply(splitstars, length, integer(1))
# Convert stars into a single column, repeating each movie's gross and
# budget once per star
unliststars <- data.frame(
  stars = unlist(splitstars),
  movie.gross = rep(data$movie.gross, stars_per_movie),
  movie.budget = rep(data$movie.budget, stars_per_movie)
)
# Remove surrounding spaces from the star names
trimstars <- data.frame(
  stars = stri_trim(as.character(unliststars$stars)),
  movie.gross = unliststars$movie.gross,
  movie.budget = unliststars$movie.budget
)
write.csv(trimstars, "StarsGross.csv", row.names = FALSE) # Save output as CSV

Summarization using Excel sheet for hypothesis number 1

Interestingly, from an ROI perspective, Musical, Romance and Horror movies come out on top, but they aren't amongst the top in Gross and Budget.
Summarization of hypothesis