R Code to extract Data of Top 250 movies from IMDB

library(plyr)
# this code basically installed packages if not installed already and load the mentioned packages 
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load("rvest","XML","stringi")

# Scrape the IMDb Top 250 chart and save rank, title, cast, link, year,
# vote count and rating of every listed movie to 'IMDB Top 250.csv'.

# Set working dir; every CSV written below is created relative to it.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
setwd("C:\\Users\\ADMIN\\Google Drive\\DC\\DC_GA_2")

# URL of the IMDb page where the Top 250 movies are listed
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
# Read the html text
page <- read_html(url)
# Filter html data by css element: one <a> node per movie title
movie.nodes <- html_nodes(page, ".titleColumn a")

# Fail early with a clear message instead of letting data.frame() complain
# about mismatched row counts when the selector matches nothing.
if (length(movie.nodes) == 0) {
  stop("No movies found at ", url,
       "; the page layout or the CSS selectors may have changed.",
       call. = FALSE)
}

# Each movie node carries three pieces of information: the movie link (href
# attribute), the movie cast (title attribute) and the movie name (node text).
# vapply() enforces exactly one string per node, so the result is always a
# character vector.
movie.attrs <- html_attrs(movie.nodes)
movie.link <- vapply(movie.attrs, `[[`, character(1), "href")
movie.link <- paste0("http://www.imdb.com", movie.link)
movie.cast <- vapply(movie.attrs, `[[`, character(1), "title")
movie.name <- html_text(movie.nodes)

# The release year is rendered as "(1994)"; drop both parentheses.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))

# The <strong> node of each rating cell holds the rating as its text; the
# vote count is parsed out of its title attribute by removing everything up
# to "based on ", the " user ratings" suffix and the thousands separators.
rating.nodes <- html_nodes(page, ".imdbRating strong")
rating.title <- vapply(html_attrs(rating.nodes), `[[`, character(1), "title")
votes <- as.numeric(gsub(",", "",
                         gsub(" user ratings", "",
                              gsub(".*?based on ", "", rating.title))))

rating <- as.numeric(html_text(rating.nodes))

# Number of rows in the data set
rows <- length(movie.name)

# Create the data frame.
# A rank column is added as well so rows can be cross-verified against the chart.
rank <- seq_len(rows)
top250 <- data.frame(rank, movie.name, movie.cast, movie.link, year, votes, rating)
# Save as csv file
write.csv(top250, "IMDB Top 250.csv", row.names = FALSE)

Snapshot of top 250 movies

##   rank               movie.name
## 1    1 The Shawshank Redemption
## 2    2            The Godfather
## 3    3   The Godfather: Part II
## 4    4          The Dark Knight
## 5    5             12 Angry Men
## 6    6         Schindler's List
##                                               movie.cast
## 1     Frank Darabont (dir.), Tim Robbins, Morgan Freeman
## 2  Francis Ford Coppola (dir.), Marlon Brando, Al Pacino
## 3 Francis Ford Coppola (dir.), Al Pacino, Robert De Niro
## 4 Christopher Nolan (dir.), Christian Bale, Heath Ledger
## 5          Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb
## 6    Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes
##                                                                                                                                                               movie.link
## 1 http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
## 2 http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
## 3 http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
## 4 http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
## 5 http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
## 6 http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
##   year   votes rating
## 1 1994 1718456    9.2
## 2 1972 1174559    9.2
## 3 1974  804922    9.0
## 4 2008 1704632    8.9
## 5 1957  457572    8.9
## 6 1993  880006    8.9

Task 1: Find all movies released between 1996 and 1998 (both years inclusive)

# The scraped year column is not numeric (it is text, or a factor when
# data.frame() converted the strings), so go through character to recover
# the actual year values rather than factor level codes.
top250$year <- as.numeric(as.character(top250$year))

# Keep the rows whose year lies strictly between 1995 and 1999; rows with a
# missing year are dropped.
released_1996_1998 <- top250$year > 1995 & top250$year < 1999
movies1996_1998 <- top250[released_1996_1998 & !is.na(released_1996_1998), ,
                          drop = FALSE]

# Save the selection without row names
write.csv(movies1996_1998, "movies1996_1998.csv", row.names = FALSE)

Movies released during the 1996-1998 time period

rank movie.name movie.cast movie.link year votes rating
26 26 La vita è bella Roberto Benigni (dir.), Roberto Benigni, Nicoletta Braschi http://www.imdb.com/title/tt0118799/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_26 1997 429117 8.6
29 29 Saving Private Ryan Steven Spielberg (dir.), Tom Hanks, Matt Damon http://www.imdb.com/title/tt0120815/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_29 1998 897416 8.5
31 31 American History X Tony Kaye (dir.), Edward Norton, Edward Furlong http://www.imdb.com/title/tt0120586/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_31 1998 795211 8.5
65 65 Mononoke-hime Hayao Miyazaki (dir.), Yôji Matsuda, Yuriko Ishida http://www.imdb.com/title/tt0119698/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_65 1997 226361 8.4
97 97 L.A. Confidential Curtis Hanson (dir.), Kevin Spacey, Russell Crowe http://www.imdb.com/title/tt0119488/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_97 1997 420345 8.3
107 107 Good Will Hunting Gus Van Sant (dir.), Robin Williams, Matt Damon http://www.imdb.com/title/tt0119217/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_107 1997 616586 8.2
118 118 Bacheha-Ye aseman Majid Majidi (dir.), Mohammad Amir Naji, Amir Farrokh Hashemian http://www.imdb.com/title/tt0118849/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_118 1997 29686 8.2
136 136 Lock, Stock and Two Smoking Barrels Guy Ritchie (dir.), Jason Flemyng, Dexter Fletcher http://www.imdb.com/title/tt0120735/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_136 1998 420939 8.2
151 151 The Big Lebowski Joel Coen (dir.), Jeff Bridges, John Goodman http://www.imdb.com/title/tt0118715/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_151 1998 546858 8.1
155 155 Trainspotting Danny Boyle (dir.), Ewan McGregor, Ewen Bremner http://www.imdb.com/title/tt0117951/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_155 1996 476883 8.1
157 157 Fargo Joel Coen (dir.), William H. Macy, Frances McDormand http://www.imdb.com/title/tt0116282/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_157 1996 461963 8.1
206 206 The Truman Show Peter Weir (dir.), Jim Carrey, Ed Harris http://www.imdb.com/title/tt0120382/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_206 1998 680829 8.0

Task 2: Scrape the following information: Director, Stars, Taglines, Genres, (partial) Storyline, Box office budget and Box office gross.

# Task 2 scraper: visit every movie page listed in top250$movie.link and
# collect director(s), stars, tagline, genres, storyline, budget and gross.
# Results are stored in seven parallel character vectors, one entry per movie,
# in the same order as the rows of top250.

# Collapse every run of spaces to a single space and drop leading and
# trailing spaces.
squeeze_spaces <- function(x) {
  gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
}

# Text following the first ":" of a "Label: value" string (the whole string
# when it contains no colon; NA stays NA).
after_label <- function(x) {
  sub("^[^:]*:", "", x)
}

# Value of a Budget/Gross ".txt-block": the text between the first ":" and
# the first blank line.
txt_block_value <- function(block) {
  value <- substr(block,
                  regexpr(":", block)[1] + 1,
                  regexpr("\n\n", block)[1] - 1)
  squeeze_spaces(value)
}

# Value of the Taglines ".txt-block". Only the tagline shown on the movie
# page is extracted, not the full list behind the "See more" link; that link
# text and the surrounding whitespace are removed from the result.
tagline_value <- function(block) {
  value <- substr(block, regexpr(":", block)[1] + 2, nchar(block))
  value <- sub("(?s)\\s*\\n\\s*See more\\b.*$", "", value, perl = TRUE)
  trimws(squeeze_spaces(value))
}

# A for loop over a factor yields its labels, but indexing by position needs
# plain strings.
movie_links <- as.character(top250$movie.link)
n_movies <- length(movie_links)

# Vectors to hold column data; NA marks a movie whose page could not be read.
directors <- rep(NA_character_, n_movies)
movie.stars <- rep(NA_character_, n_movies)
taglines <- rep(NA_character_, n_movies)
storyline <- rep(NA_character_, n_movies)
generes <- rep(NA_character_, n_movies)
movie.budget <- rep(NA_character_, n_movies)
movie.gross <- rep(NA_character_, n_movies)

# Loop over all the movie links and extract the required data
for (i in seq_along(movie_links)) {
  # One unreachable page should not throw away everything scraped so far:
  # warn, leave this movie's fields as NA and carry on.
  page <- tryCatch(
    read_html(movie_links[i]),
    error = function(e) {
      warning("Skipping ", movie_links[i], ": ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    next
  }

  # Tagline, budget and gross live in ".txt-block" nodes and are told apart
  # by their label; a field without a matching node stays "".
  tagline <- ""
  budget <- ""
  gross <- ""
  for (block in html_text(html_nodes(page, ".txt-block"))) {
    if (grepl("Taglines:", block, fixed = TRUE)) {
      tagline <- tagline_value(block)
    } else if (grepl("Budget:", block, fixed = TRUE)) {
      budget <- txt_block_value(block)
    } else if (grepl("Gross:", block, fixed = TRUE)) {
      gross <- txt_block_value(block)
    }
  }
  taglines[i] <- tagline
  movie.gross[i] <- gross
  movie.budget[i] <- budget

  # Storyline and genres come from the ".canwrap" nodes, picked by position:
  # the first node is used as the storyline and the third as the genres entry.
  # NOTE(review): positional lookup assumes every page has the same node
  # order -- confirm against the current page layout.
  canwrap <- gsub("\n", "", html_text(html_nodes(page, ".canwrap")))
  genre_list <- squeeze_spaces(after_label(canwrap[3]))
  generes[i] <- gsub("|", ",", genre_list, fixed = TRUE)
  storyline[i] <- squeeze_spaces(canwrap[1])

  # Director(s) and stars come from the ".credit_summary_item" nodes, again by
  # position: the first node for the director(s), the third for the stars.
  credits <- gsub("\n", "", html_text(html_nodes(page, ".credit_summary_item")))
  directors[i] <- trimws(squeeze_spaces(after_label(credits[1])))

  # The stars entry ends with a "|See full cast & crew" link; drop it.
  star_names <- sub("\\|.*$", "", after_label(credits[3]))
  movie.stars[i] <- trimws(squeeze_spaces(star_names))
}

Task 3: create data frame from extracted information

# Combine the movie names with the per-movie vectors scraped in Task 2.
# The name column is labelled 'movie.name' directly at construction time.
top250_movies_details <- data.frame(
  movie.name = top250$movie.name,
  directors,
  taglines,
  movie.stars,
  generes,
  storyline,
  movie.gross,
  movie.budget
)
# Write the combined details to a csv file, without row names
write.csv(top250_movies_details, "top250_movies_details.csv", row.names = FALSE)

Display head of top 250 movies details

# Preview the first six rows of the details table
head(top250_movies_details, n = 6L)
##                 movie.name              directors
## 1 The Shawshank Redemption        Frank Darabont 
## 2            The Godfather  Francis Ford Coppola 
## 3   The Godfather: Part II  Francis Ford Coppola 
## 4          The Dark Knight     Christopher Nolan 
## 5             12 Angry Men          Sidney Lumet 
## 6         Schindler's List      Steven Spielberg 
##                                                                         taglines
## 1                             Fear can hold you prisoner. Hope can set you free.
## 2                                  An offer you can't refuse. \n See more »\n \n
## 3                                                                               
## 4                                   I Believe In Harvey Dent. \n See more »\n \n
## 5 They have twelve scraps of paper... Twelve chances to kill! \n See more »\n \n
## 6             Whoever saves one life, saves the world entire. \n See more »\n \n
##                                   movie.stars
## 1     Tim Robbins, Morgan Freeman, Bob Gunton
## 2        Marlon Brando, Al Pacino, James Caan
## 3    Al Pacino, Robert De Niro, Robert Duvall
## 4 Christian Bale, Heath Ledger, Aaron Eckhart
## 5     Henry Fonda, Lee J. Cobb, Martin Balsam
## 6    Liam Neeson, Ralph Fiennes, Ben Kingsley
##                             generes
## 1                     Crime , Drama
## 2                     Crime , Drama
## 3                     Crime , Drama
## 4 Action , Crime , Drama , Thriller
## 5                     Crime , Drama
## 6       Biography , Drama , History
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          storyline
## 1                                                                                                                                                                                               Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red. Written byJ-S-Golden
## 2                                                                                                                                                                                                             When the aging head of a famous crime family decides to transfer his position to one of his subalterns, a series of unfortunate events start happening to the family, and a war begins between all the well-known families leading to insolence, deportation, murder and revenge, and ends with the favorable successor being finally chosen. Written byJ. S. Golden
## 3                                                                                                                                                                                                                                                                                The continuing saga of the Corleone crime family tells the story of a young Vito Corleone growing up in Sicily and in 1910s New York; and follows Michael Corleone in the 1950s as he attempts to expand the family business into Las Vegas, Hollywood and Cuba. Written byKeith Loh <loh@sfu.ca>
## 4 Set within a year after the events of Batman Begins, Batman, Lieutenant James Gordon, and new district attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City until a mysterious and sadistic criminal mastermind known only as the Joker appears in Gotham, creating a new wave of chaos. Batman's struggle against the Joker becomes deeply personal, forcing him to "confront everything he believes" and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent and Rachel Dawes. Written byLeon Lombardi
## 5                                         The defense and the prosecution have rested and the jury is filing into the jury room to decide if a young man is guilty or innocent of murdering his father. What begins as an open-and-shut case of murder soon becomes a detective story that presents a succession of clues creating doubt, and a mini-drama of each of the jurors' prejudices and preconceptions about the trial, the accused, and each other. Based on the play, all of the action takes place on the stage of the jury room. Written bypjk <PETESID@VNET.IBM.COM>
## 6                                                                                                                                    Oskar Schindler is a vainglorious and greedy German businessman who becomes an unlikely humanitarian amid the barbaric German Nazi reign when he feels compelled to turn his factory into a refuge for Jews. Based on the true story of Oskar Schindler who managed to save about 1100 Jews from being gassed at the Auschwitz concentration camp, it is a testament to the good in all of us. Written byHarald Mayr <marvin@bike.augusta.de>
##    movie.gross movie.budget
## 1  $28,341,469  $25,000,000
## 2 $134,821,952   $6,000,000
## 3  $57,300,000  $13,000,000
## 4 $533,316,061 $185,000,000
## 5                  $350,000
## 6  $96,067,179  $22,000,000

Task 4: Make a table of movie count versus genre.

# Count how many movies fall under each genre and save the table as csv.

# A movie can list several comma-separated genres: split them apart ...
splitgenre <- strsplit(as.character(top250_movies_details$generes), ",", fixed = TRUE)
# ... flatten the per-movie pieces into a single column ...
unlistgenre <- data.frame(genre = unlist(splitgenre))
# ... and strip the whitespace left around each genre label.
genre <- stri_trim(unlistgenre$genre)

# Frequency of every distinct genre label (columns: genre, Freq)
genresummary <- data.frame(table(genre))
write.csv(genresummary, "GenreOfTop250Movies.csv", row.names = FALSE)

Frequency table and bar graph of movie counts vs. genre

genre Freq
Action 36
Adventure 62
Animation 20
Biography 26
Comedy 42
Crime 56
Drama 176
Family 24
Fantasy 33
Film-Noir 7
History 17
Horror 5
Music 2
Musical 5
Mystery 36
Romance 24
Sci-Fi 31
Sport 7
Thriller 63
War 30
Western 10

Task 4a: Hypothesis testing

Hypothesis: We wanted to check and understand how different genres perform w.r.t. Gross, Budget and ROI (i.e. Gross/Budget). To simplify, we have considered only the 166 movies from the top 250 that have both Budget and Gross available in USD. Also, due to lack of time, this particular analysis was done partly in R (data cleaning) and partly in Excel (summarization).

# Task 4a data preparation: turn the scraped gross and budget strings into
# numbers, then write one row per (movie, genre) to 'GenreGross.csv' and one
# row per (movie, star) to 'StarsGross.csv' for summarization outside R.

data <- top250_movies_details

# Convert a scraped money string such as "$28,341,469" to a number by keeping
# its digits only; values without any digit (e.g. "") become NA.
# NOTE(review): the currency symbol is discarded, so amounts quoted in other
# currencies cannot be told apart from USD after this step.
to_amount <- function(x) {
  as.numeric(gsub("[^0-9]", "", as.character(x)))
}

# Numeric gross and budget. The conversion is applied to the cleaned digits;
# calling as.numeric() on the raw "$..." strings would only produce NA.
data$gross <- to_amount(data$movie.gross)
data$budget <- to_amount(data$movie.budget)
# Keep the original gross/budget columns numeric as well.
data$movie.gross <- data$gross
data$movie.budget <- data$budget

# Remove stray characters from the genre lists (letters, digits, commas,
# slashes, apostrophes and spaces are kept).
data$generes <- gsub("[^0-9A-Za-z,///' ]", "", data$generes)

# --- Gross and budget per genre ---------------------------------------------
# Split the multiple genres of the same movie
splitgenre <- strsplit(as.character(data$generes), ",")
genre_counts <- lengths(splitgenre)
# One row per genre, repeating the movie's gross and budget for each of them
unlistgenre <- data.frame(genre = unlist(splitgenre),
                          gross = rep(data$gross, genre_counts),
                          budget = rep(data$budget, genre_counts),
                          stringsAsFactors = FALSE)
# Remove spaces around the genre labels
trimgenre <- data.frame(genre = stri_trim(unlistgenre$genre),
                        gross = unlistgenre$gross,
                        budget = unlistgenre$budget)

write.csv(trimgenre, "GenreGross.csv", row.names = FALSE) # Saving output in csv

# --- Gross and budget per star ----------------------------------------------
# Split the multiple stars of the same movie
splitstars <- strsplit(as.character(data$movie.stars), ",")
star_counts <- lengths(splitstars)
# One row per star, repeating the movie's gross and budget for each of them
unliststars <- data.frame(stars = unlist(splitstars),
                          movie.gross = rep(data$gross, star_counts),
                          movie.budget = rep(data$budget, star_counts),
                          stringsAsFactors = FALSE)
# Remove spaces around the star names
trimstars <- data.frame(stars = stri_trim(unliststars$stars),
                        movie.gross = unliststars$movie.gross,
                        movie.budget = unliststars$movie.budget)

write.csv(trimstars, "StarsGross.csv", row.names = FALSE) # Saving output in csv

Summarization using Excel sheet for hypothesis number 1

Interestingly, from ROI perspective, Musical, Romance and Horror movies come on top but they aren’t amongst the top in Gross and Budget.
Summarization of hypothesis