library(plyr)
# Install pacman if it is not installed already, then use it to install (if needed) and load the packages listed below
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load("rvest","XML","stringi")
# Set the working directory; every CSV written below is created relative to it.
# NOTE(review): this absolute path only exists on the original author's
# machine - adjust it before running the script anywhere else.
setwd("C:\\Users\\ADMIN\\Google Drive\\DC\\DC_GA_2")
# URL of the IMDB chart that lists the Top 250 movies
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
# Download and parse the chart page
page <- read_html(url)
# Select one link node per movie with a CSS selector
movie.nodes <- html_nodes(page, ".titleColumn a")
# Fail early with a clear message when the selector matches nothing (for
# example because the page layout changed) instead of failing later with an
# obscure error while the data frame is assembled.
if (length(movie.nodes) == 0) {
  stop("No movie nodes found for selector '.titleColumn a' at ", url,
       call. = FALSE)
}
# Every movie node carries three pieces of information: the movie link (href
# attribute), the movie cast (title attribute) and the movie name (node text).
# Each one is extracted into its own vector. html_attr() always returns a
# character vector, with NA where a node lacks the attribute.
movie.link <- paste0("http://www.imdb.com", html_attr(movie.nodes, "href"))
movie.cast <- html_attr(movie.nodes, "title")
movie.name <- html_text(movie.nodes)
# The movie name's CSS selector was found with SelectorGadget; the year's
# selector was found the same way. The year text is wrapped in parentheses,
# e.g. "(1994)", so both parentheses are stripped.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))
# Rating nodes: the <strong> element inside ".imdbRating" holds the rating as
# its text, and its title attribute contains "... based on <n> user ratings".
rating.nodes <- html_nodes(page, ".imdbRating strong")
# Reduce the title attribute to the bare vote count: drop everything up to and
# including "based on ", drop the " user ratings" suffix, then remove the
# thousands separators before converting to a number.
rating.title <- html_attr(rating.nodes, "title")
votes <- gsub(".*?based on ", "", rating.title)
votes <- gsub(" user ratings", "", votes)
votes <- as.numeric(gsub(",", "", votes))
# The rating itself is the text of the node
rating <- as.numeric(html_text(rating.nodes))
# Number of movies scraped from the chart page (reused further down)
rows <- length(movie.name)
# Rank column, added for cross verification against the chart order.
# seq_len() yields an empty vector for zero rows, whereas 1:rows would
# yield c(1, 0).
rank <- seq_len(rows)
# Assemble one row per movie
top250 <- data.frame(rank, movie.name, movie.cast, movie.link, year, votes, rating)
# Save as csv file
write.csv(top250, "IMDB Top 250.csv", row.names = FALSE)
## rank movie.name
## 1 1 The Shawshank Redemption
## 2 2 The Godfather
## 3 3 The Godfather: Part II
## 4 4 The Dark Knight
## 5 5 12 Angry Men
## 6 6 Schindler's List
## movie.cast
## 1 Frank Darabont (dir.), Tim Robbins, Morgan Freeman
## 2 Francis Ford Coppola (dir.), Marlon Brando, Al Pacino
## 3 Francis Ford Coppola (dir.), Al Pacino, Robert De Niro
## 4 Christopher Nolan (dir.), Christian Bale, Heath Ledger
## 5 Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb
## 6 Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes
## movie.link
## 1 http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
## 2 http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
## 3 http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
## 4 http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
## 5 http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
## 6 http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=1RGAWVKE37VCE5QF273P&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
## year votes rating
## 1 1994 1718456 9.2
## 2 1972 1174559 9.2
## 3 1974 804922 9.0
## 4 2008 1704632 8.9
## 5 1957 457572 8.9
## 6 1993 880006 8.9
# The year column may be a factor rather than a number, so convert it through
# character first to get the actual year values.
top250$year <- as.numeric(as.character(top250$year))
# Keep the movies released in 1996, 1997 or 1998; rows whose year is NA are
# left out.
released.1996.1998 <- top250$year > 1995 & top250$year < 1999
movies1996_1998 <- top250[released.1996.1998 & !is.na(released.1996.1998), , drop = FALSE]
# Save the selection as csv file
write.csv(movies1996_1998, "movies1996_1998.csv", row.names = FALSE)
# Scrape every movie's detail page (top250$movie.link) and collect director(s),
# main stars, tagline, storyline, genres, budget and gross into one character
# vector per field, in the same order as the rows of top250.

# Collapse each run of spaces into a single space and drop the spaces at both
# ends of the string.
squish_spaces <- function(x) {
  gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
}

# Return the text that follows the first ":" in `txt`. When `terminator` is
# given and occurs in that text, everything from the terminator onwards is
# dropped; when it does not occur the whole remainder is kept.
# An NA input yields NA.
value_after_colon <- function(txt, terminator = NULL) {
  if (is.na(txt)) {
    return(NA_character_)
  }
  colon <- regexpr(":", txt, fixed = TRUE)[1]
  value <- substr(txt, colon + 1, nchar(txt))
  if (!is.null(terminator)) {
    cut <- regexpr(terminator, value, fixed = TRUE)[1]
    if (cut != -1) {
      value <- substr(value, 1, cut - 1)
    }
  }
  value
}

# Return the first element of `items` that matches the regular expression
# `label`. When nothing matches, fall back to the element at position
# `fallback` (NA when that position does not exist).
labelled_item <- function(items, label, fallback) {
  hits <- grep(label, items)
  if (length(hits) > 0) {
    items[hits[1]]
  } else {
    items[fallback]
  }
}

# Download one movie page and return a list with the fields director, stars,
# tagline, storyline, genres, budget and gross (each a single string).
# tagline, budget and gross are "" when the page has no such entry; the other
# fields are NA when the corresponding node is missing.
scrape_movie_details <- function(link) {
  page <- read_html(link)

  # Tagline, budget and gross all live in ".txt-block" nodes and are told
  # apart by their label. If several nodes carry the same label the last one
  # wins.
  tagline <- ""
  budget <- ""
  gross <- ""
  for (block in html_text(html_nodes(page, ".txt-block"))) {
    if (grepl("Taglines:", block, fixed = TRUE)) {
      # Warning: only the tagline shown on the movie page is extracted, not
      # the full list of taglines (that would need another read_html() call
      # on the linked taglines page).
      tagline <- value_after_colon(block)
      # Drop the trailing "See more" link text that follows the tagline on
      # its own line.
      tagline <- sub("(?s)\\s*\\n\\s*See more.*$", "", tagline, perl = TRUE)
      tagline <- squish_spaces(trimws(tagline))
    } else if (grepl("Budget:", block, fixed = TRUE)) {
      # The amount ends at the first blank line after the label
      budget <- squish_spaces(value_after_colon(block, terminator = "\n\n"))
    } else if (grepl("Gross:", block, fixed = TRUE)) {
      gross <- squish_spaces(value_after_colon(block, terminator = "\n\n"))
    }
  }

  # Storyline and genres come from the ".canwrap" nodes: the storyline is the
  # first node; the genres node is looked up by its label and, failing that,
  # taken from the third position.
  # NOTE(review): the "Genres:" label is an assumption about the page markup -
  # confirm against a live page.
  canwrap <- gsub("\n", "", html_text(html_nodes(page, ".canwrap")))
  genres <- labelled_item(canwrap, "^[[:space:]]*Genres?:", fallback = 3)
  genres <- squish_spaces(value_after_colon(genres))
  genres <- gsub("|", ",", genres, fixed = TRUE)
  storyline <- squish_spaces(canwrap[1])

  # Director(s) and main stars come from the ".credit_summary_item" nodes,
  # looked up by label with the first / third position as fallback.
  # NOTE(review): the "Director(s):" label is an assumption about the page
  # markup - confirm against a live page.
  casts <- gsub("\n", "", html_text(html_nodes(page, ".credit_summary_item")))
  director <- labelled_item(casts, "^[[:space:]]*Directors?:", fallback = 1)
  director <- squish_spaces(value_after_colon(director))
  stars <- labelled_item(casts, "^[[:space:]]*Stars?:", fallback = 3)
  stars <- value_after_colon(stars)
  # Remove the "|See full cast & crew" link text that follows the names
  stars <- sub("[[:space:]]*[|].*$", "", stars)
  stars <- squish_spaces(stars)

  list(
    director = director,
    stars = stars,
    tagline = tagline,
    storyline = storyline,
    genres = genres,
    budget = budget,
    gross = gross
  )
}

# Vectors that hold the column data, one element per movie
n_movies <- nrow(top250)
directors <- character(n_movies)
movie.stars <- character(n_movies)
taglines <- character(n_movies)
storyline <- character(n_movies)
generes <- character(n_movies)
movie.budget <- character(n_movies)
movie.gross <- character(n_movies)

# Loop over all the movie links and extract the required data
for (i in seq_len(n_movies)) {
  # as.character() keeps this working when movie.link is stored as a factor
  details <- scrape_movie_details(as.character(top250$movie.link[i]))
  directors[i] <- details$director
  movie.stars[i] <- details$stars
  taglines[i] <- details$tagline
  storyline[i] <- details$storyline
  generes[i] <- details$genres
  movie.budget[i] <- details$budget
  movie.gross[i] <- details$gross
}
# Combine the scraped detail columns with the movie names; the name column is
# called movie.name from the start, so no rename step is needed afterwards.
top250_movies_details <- data.frame(
  movie.name = top250$movie.name,
  directors,
  taglines,
  movie.stars,
  generes,
  storyline,
  movie.gross,
  movie.budget
)
# Write this information back into a csv file
write.csv(top250_movies_details, "top250_movies_details.csv", row.names = FALSE)
# Show the first rows as a quick check
head(top250_movies_details)
## movie.name directors
## 1 The Shawshank Redemption Frank Darabont
## 2 The Godfather Francis Ford Coppola
## 3 The Godfather: Part II Francis Ford Coppola
## 4 The Dark Knight Christopher Nolan
## 5 12 Angry Men Sidney Lumet
## 6 Schindler's List Steven Spielberg
## taglines
## 1 Fear can hold you prisoner. Hope can set you free.
## 2 An offer you can't refuse. \n See more »\n \n
## 3
## 4 I Believe In Harvey Dent. \n See more »\n \n
## 5 They have twelve scraps of paper... Twelve chances to kill! \n See more »\n \n
## 6 Whoever saves one life, saves the world entire. \n See more »\n \n
## movie.stars
## 1 Tim Robbins, Morgan Freeman, Bob Gunton
## 2 Marlon Brando, Al Pacino, James Caan
## 3 Al Pacino, Robert De Niro, Robert Duvall
## 4 Christian Bale, Heath Ledger, Aaron Eckhart
## 5 Henry Fonda, Lee J. Cobb, Martin Balsam
## 6 Liam Neeson, Ralph Fiennes, Ben Kingsley
## generes
## 1 Crime , Drama
## 2 Crime , Drama
## 3 Crime , Drama
## 4 Action , Crime , Drama , Thriller
## 5 Crime , Drama
## 6 Biography , Drama , History
## storyline
## 1 Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red. Written byJ-S-Golden
## 2 When the aging head of a famous crime family decides to transfer his position to one of his subalterns, a series of unfortunate events start happening to the family, and a war begins between all the well-known families leading to insolence, deportation, murder and revenge, and ends with the favorable successor being finally chosen. Written byJ. S. Golden
## 3 The continuing saga of the Corleone crime family tells the story of a young Vito Corleone growing up in Sicily and in 1910s New York; and follows Michael Corleone in the 1950s as he attempts to expand the family business into Las Vegas, Hollywood and Cuba. Written byKeith Loh <loh@sfu.ca>
## 4 Set within a year after the events of Batman Begins, Batman, Lieutenant James Gordon, and new district attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City until a mysterious and sadistic criminal mastermind known only as the Joker appears in Gotham, creating a new wave of chaos. Batman's struggle against the Joker becomes deeply personal, forcing him to "confront everything he believes" and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent and Rachel Dawes. Written byLeon Lombardi
## 5 The defense and the prosecution have rested and the jury is filing into the jury room to decide if a young man is guilty or innocent of murdering his father. What begins as an open-and-shut case of murder soon becomes a detective story that presents a succession of clues creating doubt, and a mini-drama of each of the jurors' prejudices and preconceptions about the trial, the accused, and each other. Based on the play, all of the action takes place on the stage of the jury room. Written bypjk <PETESID@VNET.IBM.COM>
## 6 Oskar Schindler is a vainglorious and greedy German businessman who becomes an unlikely humanitarian amid the barbaric German Nazi reign when he feels compelled to turn his factory into a refuge for Jews. Based on the true story of Oskar Schindler who managed to save about 1100 Jews from being gassed at the Auschwitz concentration camp, it is a testament to the good in all of us. Written byHarald Mayr <marvin@bike.augusta.de>
## movie.gross movie.budget
## 1 $28,341,469 $25,000,000
## 2 $134,821,952 $6,000,000
## 3 $57,300,000 $13,000,000
## 4 $533,316,061 $185,000,000
## 5 $350,000
## 6 $96,067,179 $22,000,000
# Count how many of the top 250 movies fall into each genre.
# A movie can list several comma-separated genres: split them apart first ...
splitgenre <- strsplit(as.character(top250_movies_details$generes), ",", fixed = TRUE)
# ... stack all genres of all movies into a single column ...
unlistgenre <- data.frame(genre = unlist(splitgenre, use.names = FALSE))
# ... strip the surrounding white space from every genre label ...
genre <- stri_trim_both(unlistgenre$genre)
# ... and tabulate the number of movies per genre.
genresummary <- data.frame(table(genre))
# Save the counts as csv file
write.csv(genresummary, "GenreOfTop250Movies.csv", row.names = FALSE)
| genre | Freq |
|---|---|
| Action | 36 |
| Adventure | 62 |
| Animation | 20 |
| Biography | 26 |
| Comedy | 42 |
| Crime | 56 |
| Drama | 176 |
| Family | 24 |
| Fantasy | 33 |
| Film-Noir | 7 |
| History | 17 |
| Horror | 5 |
| Music | 2 |
| Musical | 5 |
| Mystery | 36 |
| Romance | 24 |
| Sci-Fi | 31 |
| Sport | 7 |
| Thriller | 63 |
| War | 30 |
| Western | 10 |
Hypothesis: We wanted to check and understand how different genres perform w.r.t. Gross, Budget and ROI (i.e. Gross/Budget). To simplify, we have considered only the 166 movies, out of the top 250 US movies, that have both Budget and Gross available in USD. Also, due to lack of time, this particular analysis is done partly in R (data cleaning) and partly in Excel (summarization).
# Build a long table with one row per (movie, genre) pair carrying the movie's
# gross and budget, and save it as GenreGross.csv for summarising by genre.
data <- top250_movies_details
# Keep only the digits of the scraped money strings (e.g. "$28,341,469") and
# convert those cleaned copies to numbers. The raw movie.gross / movie.budget
# columns are left untouched: converting them directly would not work because
# they still contain "$" and ",". Empty or missing amounts become NA.
# Note that currency symbols are dropped, so the currency is no longer visible.
data$gross <- as.numeric(gsub("[^0-9]", "", as.character(data$movie.gross)))
data$budget <- as.numeric(gsub("[^0-9]", "", as.character(data$movie.budget)))
# Remove stray characters from the genre list. Hyphens are kept so that labels
# such as "Sci-Fi" and "Film-Noir" stay identical to the ones written to
# GenreOfTop250Movies.csv.
data$generes <- gsub("[^0-9A-Za-z,/' -]", "", data$generes)
# Splitting multiple genres of the same movie
splitgenre <- strsplit(as.character(data$generes), ",", fixed = TRUE)
genres.per.movie <- lengths(splitgenre)
# Converting genres into a single column; each movie's gross and budget are
# repeated once per genre so the rows stay aligned.
unlistgenre <- data.frame(genre = unlist(splitgenre),
                          gross = rep(data$gross, genres.per.movie),
                          budget = rep(data$budget, genres.per.movie))
# Removing spaces around the genre labels
trimgenre <- data.frame(genre = stri_trim(unlistgenre$genre),
                        gross = unlistgenre$gross,
                        budget = unlistgenre$budget)
# Saving output in csv
write.csv(trimgenre, "GenreGross.csv", row.names = FALSE)
# Build a long table with one row per (movie, star) pair carrying the movie's
# gross and budget, and save it as StarsGross.csv.
# Removing the special characters from gross and budget: only the digits of
# the scraped strings are kept and then converted to numbers (empty or missing
# amounts become NA). The raw strings are read from top250_movies_details,
# which is never modified, rather than from `data`, whose movie.gross /
# movie.budget columns may already have been overwritten by earlier steps.
data$gross <- as.numeric(gsub("[^0-9]", "", as.character(top250_movies_details$movie.gross)))
data$budget <- as.numeric(gsub("[^0-9]", "", as.character(top250_movies_details$movie.budget)))
# Splitting multiple stars of the same movie
splitstars <- strsplit(as.character(data$movie.stars), ",", fixed = TRUE)
stars.per.movie <- lengths(splitstars)
# Converting stars into a single column; each movie's gross and budget are
# repeated once per star so the rows stay aligned.
unliststars <- data.frame(stars = unlist(splitstars),
                          movie.gross = rep(data$gross, stars.per.movie),
                          movie.budget = rep(data$budget, stars.per.movie))
# Removing spaces around the star names
trimstars <- data.frame(stars = stri_trim(unliststars$stars),
                        movie.gross = unliststars$movie.gross,
                        movie.budget = unliststars$movie.budget)
# Saving output in csv
write.csv(trimstars, "StarsGross.csv", row.names = FALSE)
Interestingly, from an ROI perspective, Musical, Romance and Horror movies come out on top, but they aren’t among the top in Gross and Budget.