DC_2_Group_Assignment

IMDB Movie

PROBLEM STATEMENT

The following code retrieves the URLs of the top 250 movies on IMDb. Once this is done, the real homework task begins. It was given to us as part of Data Collection Group Assignment 2.

library("rvest")

## Warning: package 'rvest' was built under R version 3.3.1

## Loading required package: xml2

## Warning: package 'xml2' was built under R version 3.3.1

library("XML")

## Warning: package 'XML' was built under R version 3.3.1

## 
## Attaching package: 'XML'

## The following object is masked from 'package:rvest':
## 
##     xml

# Download the IMDb Top 250 chart once; every column below is parsed from
# this single page.
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)

# One <a> node per movie inside the title column. Its text becomes the movie
# name, its `href` attribute the site-relative link, and its `title`
# attribute is kept as movie.cast.
movie.nodes <- html_nodes(page, ".titleColumn a")

# html_attr() always returns a character vector with one entry per node (NA
# when the attribute is absent), unlike sapply() over html_attrs(), whose
# result type depends on the input and which errors on a missing attribute.
movie.link <- paste0("http://www.imdb.com", html_attr(movie.nodes, "href"))
movie.cast <- html_attr(movie.nodes, "title")
movie.name <- html_text(movie.nodes)

# The release year is rendered as "(1994)"; strip both parentheses.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))

# The <strong> node's text is the rating. Its `title` attribute is reduced to
# the vote count by removing everything up to "based on ", the
# " user ratings" suffix and the thousands separators.
rating.nodes <- html_nodes(page, ".imdbRating strong")
votes <- html_attr(rating.nodes, "title")
votes <- gsub(".*?based on ", "", votes)
votes <- gsub(" user ratings", "", votes)
votes <- as.numeric(gsub(",", "", votes))
rating <- as.numeric(html_text(rating.nodes))

# Keep the text columns as plain character vectors on every R version
# (before R 4.0 data.frame() converted strings to factors by default).
top250 <- data.frame(
  movie.name, movie.cast, movie.link, year, votes, rating,
  stringsAsFactors = FALSE
)
View(top250)

Movies Between 1996 AND 1998

TASK 1 - Find all movies that were released between 1996 and 1998 (both years inclusive).

The R code below selects the movies released between 1996 and 1998 from the dataset built above:

And print the final result

# Convert the year column to numeric once. Going through as.character()
# keeps the conversion correct when the column is a factor, where
# as.numeric() alone would return the level codes instead of the years.
release_year <- as.numeric(as.character(top250$year))

# which() keeps only rows whose condition is TRUE; indexing with the raw
# logical vector would add an all-NA row for every year that failed to parse.
movie_96and98 <- top250[which(release_year >= 1996 & release_year <= 1998), ]
View(movie_96and98)

Movie Details

TASK 2 AND 3 - Read in each movie's IMDb page from its URL and scrape the following information: director, stars, taglines, genres, (partial) storyline, box office budget and box office gross. Then make a data frame with these variables as columns, with the movie name as the first variable.

The R code below visits each URL extracted in the previous step, scrapes the page's HTML and extracts the following information: director, stars, taglines, genres, (partial) storyline, box office budget and box office gross.

The final result is stored in the df_final data frame and displayed.

# Tasks 2 and 3: visit the detail page of every movie in top250 and build
# df_final with one row per movie and the columns moviename, director, stars,
# tagline, genre, storyline, budget, gross (all character).

# Strip trailing whitespace, then drop any line breaks left inside the text.
clean_text <- function(x) {
  gsub("[\r\n]", "", sub("\\s+$", "", x))
}

# Text of every node on `page` whose itemprop attribute equals `prop`.
# Returns character(0) when the page has no such node.
itemprop_text <- function(page, prop) {
  page %>%
    html_nodes(xpath = sprintf('//*[@itemprop="%s"]', prop)) %>%
    html_text()
}

# Text of the first "txt-block" that mentions `label`, starting at character
# position `from`; "" when no block mentions the label.
# NOTE(review): the `from` offsets passed below are the original hard-coded
# values, presumably sized to skip the heading text - confirm against the
# current page layout.
txt_block_field <- function(blocks, label, from) {
  hit <- grep(label, blocks, fixed = TRUE)
  if (length(hit) == 0) {
    return("")
  }
  block <- blocks[hit[1]]
  substr(block, from, nchar(block))
}

# Scrape a single movie page into a one-row data frame.
scrape_movie <- function(moviename, url) {
  page1 <- read_html(url)

  # Join all matches with a space. paste(collapse = ) yields "" for a page
  # without matches, where looping over 1:0 used to paste "NA" into the field.
  director <- paste(
    clean_text(itemprop_text(page1, "director")),
    collapse = " "
  )
  stars <- paste(clean_text(itemprop_text(page1, "actors")), collapse = " ")

  # Keep only the genre entries that come before the first node whose text
  # contains "Genres:"; that node and everything after it is ignored.
  genre_nodes <- itemprop_text(page1, "genre")
  summary_at <- grep("Genres:", genre_nodes, fixed = TRUE)
  if (length(summary_at) > 0) {
    genre_nodes <- genre_nodes[seq_len(summary_at[1] - 1)]
  }
  genre <- paste(trimws(genre_nodes), collapse = ", ")

  # Storyline: the text from character 83 up to (not including) the
  # "Written by" credit. regexpr() returns a single position, so it is a safe
  # `stop` for substr(). Subtracting 1 keeps the "W" of the credit out of the
  # result. If the credit is missing the position is -1 and the storyline
  # becomes "".
  story_text <- page1 %>%
    html_nodes(xpath = '//*[@id="titleStoryLine"]') %>%
    html_text()
  if (length(story_text) == 0) {
    # No storyline section: use "" so the row still has exactly one value.
    storyline <- ""
  } else {
    story_text <- story_text[1]
    credit_at <- regexpr("Written by\n", story_text, fixed = TRUE)
    storyline <- clean_text(substr(story_text, 83, credit_at - 1))
  }

  # Tagline, budget and gross are all read from the "txt-block" elements.
  blocks <- page1 %>%
    html_nodes(xpath = '//*[@class="txt-block"]') %>%
    html_text()

  tagline <- clean_text(txt_block_field(blocks, "Taglines:", 24))
  # For budget and gross only the first whitespace run is removed (not the
  # trailing one), exactly as before.
  budget <- gsub(
    "[\r\n]", "",
    sub("\\s+", "", txt_block_field(blocks, "Budget:", 23))
  )
  gross <- gsub(
    "[\r\n]", "",
    sub("\\s+", "", txt_block_field(blocks, "Gross:", 19))
  )

  data.frame(
    moviename, director, stars, tagline, genre, storyline, budget, gross,
    stringsAsFactors = FALSE
  )
}

# Build all rows first and bind them once; calling rbind() inside the loop
# copies the growing data frame on every iteration.
movie_rows <- lapply(seq_len(nrow(top250)), function(i) {
  scrape_movie(
    moviename = as.character(top250$movie.name[i]),
    url = as.character(top250$movie.link[i])
  )
})
df_final <- if (length(movie_rows) > 0) {
  do.call(rbind, movie_rows)
} else {
  data.frame()
}
View(df_final)

MovieCount Versus Genres

TASK 4 - Make a table movie-count versus Genres.

The R code below counts the movies per genre. A single movie can belong to several genres, so we split each movie's genre string, first collect the unique genres, and then check for each movie whether it belongs to a particular genre. If it does, that genre's count is increased by 1.

The final result is stored in dataset_MovieCount_Vs_Genres and displayed.

# Task 4: count how many movies in df_final belong to each genre.
#
# df_final$genre holds one comma-separated genre string per movie. Genres are
# compared by exact name after trimming. The earlier grep()-based comparison
# treated the name as a regular expression, so "Music" also matched
# "Musical", and an empty genre matched every genre.

# Split each movie's genre string into trimmed, non-empty genre names.
# unique() makes a movie count at most once per genre.
movie_genres <- lapply(
  strsplit(as.character(df_final$genre), split = ",", fixed = TRUE),
  function(parts) {
    parts <- unique(trimws(parts))
    parts[!is.na(parts) & nzchar(parts)]
  }
)

# All (movie, genre) memberships as one flat vector. unique() keeps the
# genres in order of first appearance.
all_genres <- as.character(unlist(movie_genres))
genre_names <- unique(all_genres)
genre_counts <- as.numeric(table(factor(all_genres, levels = genre_names)))

# Same result shape as before: a two-row list matrix with rows "genre_list"
# and "count_list" and one column per genre.
genre_list <- as.list(genre_names)
count_list <- as.list(genre_counts)
dataset_MovieCount_Vs_Genres <- rbind(genre_list, count_list)
View(dataset_MovieCount_Vs_Genres)

Bonus points

Bonus points: See if you can come up with some interesting hypotheses. For example, you could hypothesize that “Action Genres occur significantly more often than Drama in the top-250 list.” Or that “Action movies gross higher than Romance movies in the top 250 list.” Etc.

From the “MovieCount Versus Genres” data that we collected, we can say that the maximum number of movies in the top 250 belong to the “Drama” genre and the minimum to “Music”. “Crime” takes second position in the list.