PROBLEM STATEMENT
The following code retrieves the URLs of the top 250 movies on IMDb. Once this is done, your real homework task begins. This was given to us as part of Data Collection Group Assignment 2.
library("rvest")
## Warning: package 'rvest' was built under R version 3.3.1
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.3.1
library("XML")
## Warning: package 'XML' was built under R version 3.3.1
##
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
##
## xml
# Scrape the IMDb Top 250 chart page into `top250`: one row per movie with
# its name, cast blurb, absolute link, release year, vote count and rating.
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)

# One <a> per title cell; its href is a site-relative link and its title
# attribute is what gets stored as movie.cast.
movie.nodes <- html_nodes(page, ".titleColumn a")

# html_attr() returns NA for a node without the attribute, instead of the
# "subscript out of bounds" error raised by sapply(html_attrs(x), `[[`, ...).
movie.link <- paste0("http://www.imdb.com", html_attr(movie.nodes, "href"))
movie.cast <- html_attr(movie.nodes, "title")
movie.name <- html_text(movie.nodes)

# The year is rendered as "(1994)"; drop both parentheses literally.
year <- html_text(html_nodes(page, ".secondaryInfo"))
year <- gsub("(", "", year, fixed = TRUE)
year <- gsub(")", "", year, fixed = TRUE)

# The <strong> inside each rating cell holds the rating as text and a title
# attribute of the form "... based on 1,234,567 user ratings".
rating.nodes <- html_nodes(page, ".imdbRating strong")
votes <- html_attr(rating.nodes, "title")
votes <- gsub(".*?based on ", "", votes)
votes <- gsub(" user ratings", "", votes)
votes <- as.numeric(gsub(",", "", votes))
rating <- as.numeric(html_text(rating.nodes))

# Keep text columns as character rather than factor so later steps can use
# them directly; the column names are taken from the variable names.
top250 <- data.frame(
  movie.name, movie.cast, movie.link, year, votes, rating,
  stringsAsFactors = FALSE
)
View(top250)
TASK 1 - Find all movies that were released between 1996 and 1998 (both years inclusive).
The R code below selects the movies released between 1996 and 1998 from the dataset built above
and displays the final result.
# Task 1: keep the rows of top250 whose release year lies in 1996-1998
# (inclusive). The year is converted via character first, which also works
# if the column is stored as a factor.
release_year <- as.numeric(as.character(top250$year))
is_96_to_98 <- release_year >= 1996 & release_year <= 1998
movie_96and98 <- top250[is_96_to_98, ]
View(movie_96and98)
TASK 2 AND 3 - Read each movie's IMDb page from its URL and scrape the following information: director, stars, taglines, genres, (partial) storyline, box office budget and box office gross. Then make a data frame with these variables as columns, with the movie name as the first variable.
The R code below visits each URL extracted in the step above, parses the HTML and extracts the following information: director, stars, taglines, genres, (partial) storyline, box office budget and box office gross.
The final result is stored in the df_final data frame and displayed.
# Task 2/3: visit every movie page listed in top250 and collect director,
# stars, tagline, genre, storyline, budget and gross into df_final (one row
# per movie, movie name first).

# Strip trailing whitespace, then remove any remaining line breaks.
strip_trailing_and_newlines <- function(x) {
  gsub("[\r\n]", "", sub("\\s+$", "", x))
}

# Remove the first run of whitespace, then remove any remaining line breaks.
strip_first_gap_and_newlines <- function(x) {
  gsub("[\r\n]", "", sub("\\s+", "", x))
}

# Join a character vector into one string with a space before every element
# (" a b"). A zero-length vector gives "" instead of looping over 1:0.
join_with_leading_space <- function(x) {
  paste(c("", x), collapse = " ")
}

# Return the first element of `blocks` that contains the literal `label`,
# or "" when no element does.
first_block_with <- function(blocks, label) {
  hit <- grep(label, blocks, fixed = TRUE)
  if (length(hit) == 0) {
    return("")
  }
  blocks[[hit[1]]]
}

# Scrape one movie page and return a one-row data frame with the columns
# moviename, director, stars, tagline, genre, storyline, budget, gross.
scrape_movie_details <- function(movie_url, moviename) {
  page1 <- read_html(movie_url)

  # director
  director <- page1 %>%
    html_nodes(xpath = '//*[@itemprop="director"]') %>%
    html_text()
  director <- join_with_leading_space(strip_trailing_and_newlines(director))

  # stars
  stars <- page1 %>%
    html_nodes(xpath = '//*[@itemprop="actors"]') %>%
    html_text()
  stars <- join_with_leading_space(strip_trailing_and_newlines(stars))

  # genre: keep the entries that come before the first one containing
  # "Genres:" and join them with " , " behind a single leading space.
  genre_parts <- page1 %>%
    html_nodes(xpath = '//*[@itemprop="genre"]') %>%
    html_text()
  stop_at <- grep("Genres:", genre_parts, fixed = TRUE)
  if (length(stop_at) > 0) {
    genre_parts <- genre_parts[seq_len(stop_at[1] - 1)]
  }
  genre <- ""
  if (length(genre_parts) > 0) {
    genre <- paste("", paste(genre_parts, collapse = " , "))
  }

  # storyline: text from character 83 up to (not including) the
  # "Written by" credit. regexpr() gives a plain position (-1 when the
  # credit is absent, which leaves the storyline empty); a page without the
  # storyline node also yields "" rather than a zero-length column.
  storyline_text <- page1 %>%
    html_nodes(xpath = '//*[@id="titleStoryLine"]') %>%
    html_text()
  storyline <- ""
  if (length(storyline_text) > 0) {
    storyline_text <- storyline_text[[1]]
    credit_at <- as.integer(
      regexpr("Written by\n", storyline_text, fixed = TRUE)
    )
    storyline <- strip_trailing_and_newlines(
      substr(storyline_text, 83, credit_at - 1L)
    )
  }

  # tagline, budget and gross each come from the first "txt-block" element
  # containing their label; the fixed offsets skip the text preceding the
  # value inside that block.
  text_blocks <- page1 %>%
    html_nodes(xpath = '//*[@class="txt-block"]') %>%
    html_text()

  tagline <- first_block_with(text_blocks, "Taglines:")
  tagline <- strip_trailing_and_newlines(substr(tagline, 24, nchar(tagline)))

  budget <- first_block_with(text_blocks, "Budget:")
  budget <- strip_first_gap_and_newlines(substr(budget, 23, nchar(budget)))

  gross <- first_block_with(text_blocks, "Gross:")
  gross <- strip_first_gap_and_newlines(substr(gross, 19, nchar(gross)))

  data.frame(
    moviename, director, stars, tagline, genre, storyline, budget, gross,
    stringsAsFactors = FALSE
  )
}

# Collect one row per movie in a preallocated list and bind once at the end
# instead of growing df_final with rbind() on every iteration.
n_movies <- nrow(top250)
movie_rows <- vector("list", n_movies)
for (i in seq_len(n_movies)) {
  movie_rows[[i]] <- scrape_movie_details(
    as.character(top250$movie.link[i]),
    as.character(top250$movie.name[i])
  )
}
df_final <- data.frame()
if (n_movies > 0) {
  df_final <- do.call(rbind, movie_rows)
  rownames(df_final) <- NULL
}
View(df_final)
TASK 4 - Make a table movie-count versus Genres.
The R code below counts the movies per genre. A single movie can belong to several genres, so we split each movie's genre string, collect the unique genres, and then check for every movie whether it belongs to each genre; if it does, that genre's count is increased by 1.
The final result is stored in dataset_MovieCount_Vs_Genres and displayed.
# Task 4: count how many movies fall under each genre.
# df_final$genre holds one comma-separated genre string per movie; a movie
# with several genres contributes one count to each of them.

# Split every genre string on commas, trim surrounding whitespace, and drop
# empty or missing entries. unique() keeps a movie from counting twice for
# the same genre.
movie_genres <- lapply(
  strsplit(as.character(df_final$genre), split = ",", fixed = TRUE),
  function(parts) {
    parts <- trimws(parts)
    unique(parts[!is.na(parts) & nzchar(parts)])
  }
)
all_genres <- unlist(movie_genres)

# Distinct genres in order of first appearance.
genre_names <- unique(all_genres)

# Genres are compared by exact equality via match(). A pattern match would
# treat "Music" as part of "Musical" and merge or miscount the two.
genre_counts <- as.numeric(
  tabulate(match(all_genres, genre_names), nbins = length(genre_names))
)

# Same result layout as before: a two-row table with the genre names in row
# "genre_list" and the movie counts in row "count_list".
genre_list <- as.list(genre_names)
count_list <- as.list(genre_counts)
dataset_MovieCount_Vs_Genres <- rbind(genre_list, count_list)
View(dataset_MovieCount_Vs_Genres)
Bonus points: See if you can come up with some interesting hypotheses. For example, you could hypothesize that “Action Genres occur significantly more often than Drama in the top-250 list.” Or that “Action movies gross higher than Romance movies in the top 250 list.” Etc.
From the “MovieCount versus Genres” data that we collected, we can say that the largest number of movies in the top 250 belong to the “Drama” genre and the smallest number to “Music”. “Crime” takes the second position in the list.