R Markdown

Submitted by Debanjan Paul(71620021), Reema Malhotra(71620057) and Yogesh Chandra Tewari(71620096)

Fetching Basic Data from IMDB Top 250 list

library("rvest")
## Loading required package: xml2
library("XML")
## 
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
## 
##     xml
# IMDB Top 250 Movies
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)

# Each title link in the chart supplies the movie name (link text), the
# relative detail-page URL (href) and a director/cast summary (title).
movie.nodes <- html_nodes(page, '.titleColumn a')
# Check one node
#xmlTreeParse(movie.nodes[[1]])

# html_attr() yields exactly one value per node (NA when the attribute is
# absent). The previous sapply() over html_attrs() raised an error on a
# missing attribute and had an input-dependent return type.
movie.link <- html_attr(movie.nodes, 'href')
movie.link <- paste0("http://www.imdb.com", movie.link)
movie.cast <- html_attr(movie.nodes, 'title')
movie.name <- html_text(movie.nodes)

# Strip both parentheses surrounding the release year in a single pass.
year <- gsub("[()]", "", html_text(html_nodes(page, '.secondaryInfo')))

# Keep text columns as character vectors on every R version; before R 4.0
# data.frame() converts strings to factors unless told otherwise.
top250 <- data.frame(movie.name, year, movie.cast, movie.link,
                     stringsAsFactors = FALSE)
head(top250)
##                 movie.name year
## 1 The Shawshank Redemption 1994
## 2            The Godfather 1972
## 3   The Godfather: Part II 1974
## 4          The Dark Knight 2008
## 5             12 Angry Men 1957
## 6         Schindler's List 1993
##                                               movie.cast
## 1     Frank Darabont (dir.), Tim Robbins, Morgan Freeman
## 2  Francis Ford Coppola (dir.), Marlon Brando, Al Pacino
## 3 Francis Ford Coppola (dir.), Al Pacino, Robert De Niro
## 4 Christopher Nolan (dir.), Christian Bale, Heath Ledger
## 5          Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb
## 6    Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes
##                                                                                                                                                               movie.link
## 1 http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
## 2 http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
## 3 http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
## 4 http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
## 5 http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
## 6 http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
# One-column data frame holding the detail-page link of every movie.
mvurl <- data.frame(top250$movie.link)

# Empty accumulators, one per scraped field; the scraping loop stores the
# result for movie i at position i of each list.
movie.director <- movie.tagline <- movie.star <- movie.genre <- list()
movie.storyline <- movie.budget <- movie.gross <- list()

# Number of pages to visit and the loop counter (starts at the first row).
x <- nrow(mvurl)
i <- 1

This function reads a page over a slow internet connection. If a request fails, it keeps retrying every 5 seconds.

# Download and parse a web page, retrying after failed attempts.
#
# Args:
#   u:         URL of the page to fetch; passed unchanged to read_html().
#   wait:      Seconds to pause between attempts (default 5).
#   max_tries: Maximum number of attempts. The default, Inf, keeps retrying
#              until the page is read.
#
# Returns:
#   Whatever read_html(u) returns on the first successful attempt.
#
# Raises:
#   An error carrying the last failure message once `max_tries` attempts
#   have all failed.
readpage <- function(u, wait = 5, max_tries = Inf) {
  attempt <- 0
  # Retry in a loop rather than by recursion so a long outage cannot exhaust
  # the call stack, and always retry the requested `u` (never a global).
  repeat {
    attempt <- attempt + 1
    # Wrapping the page in a list makes a successful read distinguishable
    # from the condition object returned by the error handler.
    result <- tryCatch(list(page = read_html(u)), error = function(e) e)
    if (!inherits(result, "error")) {
      return(result$page)
    }
    if (attempt >= max_tries) {
      stop(
        "Failed to read ", u, " after ", attempt, " attempt(s): ",
        conditionMessage(result),
        call. = FALSE
      )
    }
    Sys.sleep(wait)
  }
}
# Return the distinct money figures that follow `label` (e.g. "Budget") in
# the supplied text chunks, with the "<label>: " prefix removed.
#
# Args:
#   text:  Character vector of whitespace-normalised page text.
#   label: Field name that precedes the figure, without the colon.
#
# Returns:
#   Character vector of unique figures; character(0) when nothing matches.
extract_box_office <- function(text, label) {
  pattern <- paste0(label, ":( +)([A-Z$? ]+)([0-9,]+)")
  # regmatches() keeps only the chunks that really matched. Slicing with
  # substr() on a failed match yields "", which later surfaced as a stray
  # trailing ", " in the collapsed budget/gross columns.
  figures <- regmatches(text, regexpr(pattern, text))
  unique(gsub(paste0(label, ": "), "", figures))
}

# Visit each movie's detail page and store the scraped fields at position i
# of the accumulator lists. The counter is advanced only after a page has
# been processed, so an interrupted run resumes at the same movie.
while (i <= x) {
  # as.character() also covers the case where the link column is a factor.
  url <- as.character(mvurl[i, 1])
  page <- readpage(url)
  movie.director[[i]] <- html_text(html_nodes(page, '.summary_text+ .credit_summary_item .itemprop'))
  movie.tagline[[i]] <- html_text(html_nodes(page, '#titleStoryLine .txt-block:nth-child(8)'))
  movie.star[[i]] <- html_text(html_nodes(page, '.credit_summary_item~ .credit_summary_item+ .credit_summary_item'))
  movie.genre[[i]] <- html_text(html_nodes(page, '.see-more.canwrap~ .canwrap a'))
  movie.storyline[[i]] <- html_text(html_nodes(page, '#titleStoryLine p'))

  ### Scrape Box Office budget and gross begin ###

  # Read the details section text and collapse runs of whitespace so the
  # figure patterns can rely on single spaces.
  details <- html_text(html_nodes(page, '#titleDetails .txt-block:nth-child(13) , #titleDetails'))
  details <- gsub("\\s+", " ", details)
  movie.budget[[i]] <- extract_box_office(details, "Budget")
  movie.gross[[i]] <- extract_box_office(details, "Gross")

  ### Scrape Box Office budget and gross end ###

  i <- i + 1
}

Insert all scraped data into the top250 movies data frame

# Turn a list of character vectors into one string per element by joining
# each element's values with ", " (an empty element becomes "").
collapse_values <- function(values) {
  vapply(values, paste, character(1L), collapse = ", ")
}

# Attach every scraped field to top250 as a plain character column.
top250$movie.director <- collapse_values(movie.director)
top250$movie.tagline <- collapse_values(movie.tagline)
top250$movie.star <- collapse_values(movie.star)
top250$movie.genre <- collapse_values(movie.genre)
top250$movie.storyline <- collapse_values(movie.storyline)
top250$movie.budget <- collapse_values(movie.budget)
top250$movie.gross <- collapse_values(movie.gross)

Begin cleaning the data

# Tagline, genre, budget and gross can come back empty (character(0) on the
# page collapses to ""); replace those empty strings with the text "NA".
for (field in c("movie.tagline", "movie.genre", "movie.budget", "movie.gross")) {
  # Snapshot of the distinct values before replacement, available for
  # manual inspection; it is overwritten on every pass.
  y <- unique(top250[[field]])
  top250[[field]] <- ifelse(top250[[field]] == "", "NA", top250[[field]])
}


# Text cleaning. Scraped values carry page labels and layout whitespace;
# trimws() removes the leading/trailing blanks that would otherwise remain
# once the labels are stripped.

#movie.tagline text cleaning: drop the "Taglines: " label and the
#"See more ..." link text
top250$movie.tagline <- trimws(
  gsub("Taglines: |See more (.*)+", "", gsub("\\s+", " ", top250$movie.tagline))
)

#movie.star text cleaning: drop the "Stars: " label and everything from
#"| See full cast & crew" onwards
top250$movie.star <- trimws(
  gsub("Stars: |(\\| See full cast & crew)+ (.*)+", "", gsub("\\s+", " ", top250$movie.star))
)

#movie.storyline text cleaning: collapse runs of whitespace to one space
top250$movie.storyline <- trimws(gsub("\\s+", " ", top250$movie.storyline))

#movie.genre text cleaning: remove all whitespace, leaving "A,B,C"
top250$movie.genre <- gsub("\\s+", "", top250$movie.genre)

Rearrange columns

# Assemble the final table: columns in presentation order, each created
# directly under its output name.
imdbtop250 <- data.frame(
  MovieName = top250$movie.name,
  ReleaseYear = top250$year,
  Movie.Director = top250$movie.director,
  Movie.Star = top250$movie.star,
  Movie.Genre = top250$movie.genre,
  Movie.Tagline = top250$movie.tagline,
  Movie.Storyline = top250$movie.storyline,
  Movie.Budget = top250$movie.budget,
  Movie.Gross = top250$movie.gross,
  Movie.Link = top250$movie.link
)

what we found

# Preview the first six rows of the assembled table.
head(imdbtop250, n = 6L)
##                  MovieName ReleaseYear       Movie.Director
## 1 The Shawshank Redemption        1994       Frank Darabont
## 2            The Godfather        1972 Francis Ford Coppola
## 3   The Godfather: Part II        1974 Francis Ford Coppola
## 4          The Dark Knight        2008    Christopher Nolan
## 5             12 Angry Men        1957         Sidney Lumet
## 6         Schindler's List        1993     Steven Spielberg
##                                      Movie.Star
## 1      Tim Robbins, Morgan Freeman, Bob Gunton 
## 2         Marlon Brando, Al Pacino, James Caan 
## 3     Al Pacino, Robert De Niro, Robert Duvall 
## 4  Christian Bale, Heath Ledger, Aaron Eckhart 
## 5      Henry Fonda, Lee J. Cobb, Martin Balsam 
## 6     Liam Neeson, Ralph Fiennes, Ben Kingsley 
##                   Movie.Genre
## 1                 Crime,Drama
## 2                 Crime,Drama
## 3                 Crime,Drama
## 4 Action,Crime,Drama,Thriller
## 5                 Crime,Drama
## 6     Biography,Drama,History
##                                          Movie.Tagline
## 1  Fear can hold you prisoner. Hope can set you free. 
## 2                       The Godfather is now a movie. 
## 3                                                   NA
## 4                   Welcome to a world without rules. 
## 5      ...it explodes like twelve sticks of dynamite! 
## 6     Whoever saves one life, saves the world entire. 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Movie.Storyline
## 1                                                                                                                                                                                                Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red. Written by J-S-Golden 
## 2                                                                                                                                                                                                              When the aging head of a famous crime family decides to transfer his position to one of his subalterns, a series of unfortunate events start happening to the family, and a war begins between all the well-known families leading to insolence, deportation, murder and revenge, and ends with the favorable successor being finally chosen. Written by J. S. Golden 
## 3                                                                                                                                                                                                                                                                                 The continuing saga of the Corleone crime family tells the story of a young Vito Corleone growing up in Sicily and in 1910s New York; and follows Michael Corleone in the 1950s as he attempts to expand the family business into Las Vegas, Hollywood and Cuba. Written by Keith Loh <loh@sfu.ca> 
## 4  Set within a year after the events of Batman Begins, Batman, Lieutenant James Gordon, and new district attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City until a mysterious and sadistic criminal mastermind known only as the Joker appears in Gotham, creating a new wave of chaos. Batman's struggle against the Joker becomes deeply personal, forcing him to "confront everything he believes" and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent and Rachel Dawes. Written by Leon Lombardi 
## 5                                          The defense and the prosecution have rested and the jury is filing into the jury room to decide if a young man is guilty or innocent of murdering his father. What begins as an open-and-shut case of murder soon becomes a detective story that presents a succession of clues creating doubt, and a mini-drama of each of the jurors' prejudices and preconceptions about the trial, the accused, and each other. Based on the play, all of the action takes place on the stage of the jury room. Written by pjk <PETESID@VNET.IBM.COM> 
## 6                                                                                                                                     Oskar Schindler is a vainglorious and greedy German businessman who becomes an unlikely humanitarian amid the barbaric German Nazi reign when he feels compelled to turn his factory into a refuge for Jews. Based on the true story of Oskar Schindler who managed to save about 1100 Jews from being gassed at the Auschwitz concentration camp, it is a testament to the good in all of us. Written by Harald Mayr <marvin@bike.augusta.de> 
##     Movie.Budget  Movie.Gross
## 1    $25,000,000  $28,341,469
## 2   $6,000,000,  $134,821,952
## 3    $13,000,000  $57,300,000
## 4 $185,000,000,  $533,316,061
## 5       $350,000           NA
## 6  $22,000,000,   $96,067,179
##                                                                                                                                                               Movie.Link
## 1 http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
## 2 http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
## 3 http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
## 4 http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
## 5 http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
## 6 http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6

Task 1 : Movies between 1996 and 1998 from IMDB Top 250

# Select the movies released from 1996 to 1998 (inclusive) and export them.
# The year is read from the data frame column itself, not from a separate
# global vector that could fall out of step with the rows, and is compared
# as a number instead of relying on string ordering. as.character() first
# keeps the conversion correct if the column is a factor.
release_year <- as.integer(as.character(top250$year))
in_range <- !is.na(release_year) & release_year >= 1996 & release_year <= 1998
movies_1996_1998 <- top250[in_range, ]
write.csv(movies_1996_1998, 'Movies_1996_1998.csv', row.names = FALSE)

what we found

# Show the full 1996-1998 selection.
print(movies_1996_1998)
##                              movie.name year
## 26                      La vita è bella 1997
## 29                  Saving Private Ryan 1998
## 31                   American History X 1998
## 65                        Mononoke-hime 1997
## 97                    L.A. Confidential 1997
## 107                   Good Will Hunting 1997
## 118                   Bacheha-Ye aseman 1997
## 136 Lock, Stock and Two Smoking Barrels 1998
## 151                    The Big Lebowski 1998
## 155                       Trainspotting 1996
## 157                               Fargo 1996
## 206                     The Truman Show 1998
##                                                          movie.cast
## 26       Roberto Benigni (dir.), Roberto Benigni, Nicoletta Braschi
## 29                   Steven Spielberg (dir.), Tom Hanks, Matt Damon
## 31                  Tony Kaye (dir.), Edward Norton, Edward Furlong
## 65               Hayao Miyazaki (dir.), Yôji Matsuda, Yuriko Ishida
## 97                Curtis Hanson (dir.), Kevin Spacey, Russell Crowe
## 107                 Gus Van Sant (dir.), Robin Williams, Matt Damon
## 118 Majid Majidi (dir.), Mohammad Amir Naji, Amir Farrokh Hashemian
## 136              Guy Ritchie (dir.), Jason Flemyng, Dexter Fletcher
## 151                    Joel Coen (dir.), Jeff Bridges, John Goodman
## 155                 Danny Boyle (dir.), Ewan McGregor, Ewen Bremner
## 157            Joel Coen (dir.), William H. Macy, Frances McDormand
## 206                        Peter Weir (dir.), Jim Carrey, Ed Harris
##                                                                                                                                                                   movie.link
## 26   http://www.imdb.com/title/tt0118799/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_26
## 29   http://www.imdb.com/title/tt0120815/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_29
## 31   http://www.imdb.com/title/tt0120586/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_31
## 65   http://www.imdb.com/title/tt0119698/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_65
## 97   http://www.imdb.com/title/tt0119488/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_97
## 107 http://www.imdb.com/title/tt0119217/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_107
## 118 http://www.imdb.com/title/tt0118849/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_118
## 136 http://www.imdb.com/title/tt0120735/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_136
## 151 http://www.imdb.com/title/tt0118715/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_151
## 155 http://www.imdb.com/title/tt0117951/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_155
## 157 http://www.imdb.com/title/tt0116282/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_157
## 206 http://www.imdb.com/title/tt0120382/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_206
##            movie.director
## 26        Roberto Benigni
## 29       Steven Spielberg
## 31              Tony Kaye
## 65         Hayao Miyazaki
## 97          Curtis Hanson
## 107          Gus Van Sant
## 118          Majid Majidi
## 136           Guy Ritchie
## 151 Joel Coen, Ethan Coen
## 155           Danny Boyle
## 157 Joel Coen, Ethan Coen
## 206            Peter Weir
##                                                                                                                                                                              movie.tagline
## 26                                                                                                           An unforgettable fable that proves love, family and imagination conquer all. 
## 29                                                                                 In the Last Great Invasion of the Last Great War, The Greatest Danger for Eight Men was Saving... One. 
## 31                                                                                                                                                             Violence as a way of life. 
## 65                                                                                                                             The Fate Of The World Rests On The Courage Of One Warrior. 
## 97                                                         It's a crime saga that will shock you. It's a mystery that will keep you guessing. It's a thriller that will keep you riveted. 
## 107                                                                                                          Some people can never believe in themselves, until someone believes in them. 
## 118                                                                                                                                            A Little Secret...Their Biggest Adventure! 
## 136                                                                                                                                                   A Disgrace to Criminals Everywhere. 
## 151                                                                                                                  (Israel, translated from Hebrew): Lebowski: Not a man, a way of life 
## 155  Choose life. Choose a job. Choose a starter home. Choose dental insurance, leisure wear and matching luggage. Choose your future. But why would anyone want to do a thing like that? 
## 157                                                                                                                                          An ordinary place, an extraordinary thriller 
## 206                                                                                                                                                               The Story Of A Lifetime 
##                                                       movie.star
## 26        Roberto Benigni, Nicoletta Braschi, Giorgio Cantarini 
## 29                          Tom Hanks, Matt Damon, Tom Sizemore 
## 31              Edward Norton, Edward Furlong, Beverly D'Angelo 
## 65                     Yôji Matsuda, Yuriko Ishida, Yûko Tanaka 
## 97                      Kevin Spacey, Russell Crowe, Guy Pearce 
## 107                     Robin Williams, Matt Damon, Ben Affleck 
## 118  Mohammad Amir Naji, Amir Farrokh Hashemian, Bahare Seddiqi 
## 136                  Jason Flemyng, Dexter Fletcher, Nick Moran 
## 151                  Jeff Bridges, John Goodman, Julianne Moore 
## 155               Ewan McGregor, Ewen Bremner, Jonny Lee Miller 
## 157           William H. Macy, Frances McDormand, Steve Buscemi 
## 206                         Jim Carrey, Ed Harris, Laura Linney 
##                      movie.genre
## 26              Comedy,Drama,War
## 29              Action,Drama,War
## 31                   Crime,Drama
## 65   Animation,Adventure,Fantasy
## 97  Crime,Drama,Mystery,Thriller
## 107                        Drama
## 118                 Drama,Family
## 136                 Comedy,Crime
## 151                 Comedy,Crime
## 155                        Drama
## 157         Crime,Drama,Thriller
## 206          Comedy,Drama,Sci-Fi
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       movie.storyline
## 26                                                                                                                                                                                                                                                                                                                                                               In 1930s Italy, a carefree Jewish book keeper named Guido starts a fairy tale life by courting and marrying a lovely woman from a nearby city. Guido and his wife have a son and live happily together until the occupation of Italy by German forces. In an attempt to hold his family together and help his son survive the horrors of a Jewish Concentration Camp, Guido imagines that the Holocaust is a game and that the grand prize for winning is a tank. Written by Anthony Hughes <husnock31@hotmail.com> 
## 29                                                                                                                                                                                         Opening with the Allied invasion of Normandy on 6 June 1944, members of the 2nd Ranger Battalion under Cpt. Miller fight ashore to secure a beachhead. Amidst the fighting, two brothers are killed in action. Earlier in New Guinea, a third brother is KIA. Their mother, Mrs. Ryan, is to receive all three of the grave telegrams on the same day. The United States Army Chief of Staff, George C. Marshall, is given an opportunity to alleviate some of her grief when he learns of a fourth brother, Private James Ryan, and decides to send out 8 men (Cpt. Miller and select members from 2nd Rangers) to find him and bring him back home to his mother... Written by J.Zelman 
## 31                                                                                                                                                                                                                                                                                             Derek Vineyard is paroled after serving 3 years in prison for killing two thugs who tried to break into/steal his truck. Through his brother, Danny Vineyard's narration, we learn that before going to prison, Derek was a skinhead and the leader of a violent white supremacist gang that committed acts of racial crime throughout L.A. and his actions greatly influenced Danny. Reformed and fresh out of prison, Derek severs contact with the gang and becomes determined to keep Danny from going down the same violent path as he did. Written by Nitesh D.(nmxpa7@msn.com) 
## 65                                                                                                                                                                                                                          While protecting his village from rampaging boar-god/demon, a confident young warrior, Ashitaka, is stricken by a deadly curse. To save his life, he must journey to the forests of the west. Once there, he's embroiled in a fierce campaign that humans were waging on the forest. The ambitious Lady Eboshi and her loyal clan use their guns against the gods of the forest and a brave young woman, Princess Mononoke, who was raised by a wolf-god. Ashitaka sees the good in both sides and tries to stem the flood of blood. This is met be animosity by both sides as they each see him as supporting the enemy. Written by Christopher Taguchi 
## 97                                                                                                                                                                                                1950's Los Angeles is the seedy backdrop for this intricate noir-ish tale of police corruption and Hollywood sleaze. Three very different cops are all after the truth, each in their own style: Ed Exley, the golden boy of the police force, willing to do almost anything to get ahead, except sell out; Bud White, ready to break the rules to seek justice, but barely able to keep his raging violence under control; and Jack Vincennes, always looking for celebrity and a quick buck until his conscience drives him to join Exley and White down the one-way path to find the truth behind the dark world of L.A. crime. Written by Greg Bole <bole@life.bio.sunysb.edu> 
## 107                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       A touching tale of a wayward young man who struggles to find his identity, living in a world where he can solve any problem, except the one brewing deep within himself, until one day he meets his soul mate who opens his mind and his heart. Written by Dima & Danielle 
## 118                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Zahra's shoes are gone; her older brother Ali lost them. They are poor, there are no shoes for Zahra until they come up with an idea: they will share one pair of shoes, Ali's. School awaits. Will the plan succeed? Written by Eileen Berdon <eberdon@aol.com> 
## 136                                                                                                                                                                                                                                                                                                                                                                                                             Four Jack-the-lads find themselves heavily - seriously heavily - in debt to an East End hard man and his enforcers after a crooked card game. Overhearing their neighbours in the next flat plotting to hold up a group of out-of-their-depth drug growers, our heros decide to stitch up the robbers in turn. In a way the confusion really starts when a pair of antique double-barrelled shotguns go missing in a completely different scam. Written by Anonymous 
## 151                                                                                                                                                                                                                                                                                                                                When "The Dude" Lebowski is mistaken for a millionaire Lebowski, two thugs urinate on his rug to coerce him into paying a debt he knows nothing about. While attempting to gain recompense for the ruined rug from his wealthy counterpart, he accepts a one-time job with high pay-off. He enlists the help of his bowling buddy, Walter, a gun-toting Jewish-convert with anger issues. Deception leads to more trouble, and it soon seems that everyone from porn empire tycoons to nihilists want something from The Dude. Written by J. Lake 
## 155                                                                                                                                                                                                                                                                                                                                                                                            A wild, freeform, Rabelaisian trip through the darkest recesses of Edinburgh low-life, focusing on Mark Renton and his attempt to give up his heroin habit, and how the latter affects his relationship with family and friends: Sean Connery wannabe Sick Boy, dimbulb Spud, psycho Begbie, 14-year-old girlfriend Diane, and clean-cut athlete Tommy, who's never touched drugs but can't help being curious about them... Written by Michael Brooke <michael@everyman.demon.co.uk> 
## 157  Jerry works in his father-in-law's car dealership and has gotten himself in financial problems. He tries various schemes to come up with money needed for a reason that is never really explained. It has to be assumed that his huge embezzlement of money from the dealership is about to be discovered by father-in-law. When all else falls through, plans he set in motion earlier for two men to kidnap his wife for ransom to be paid by her wealthy father (who doesn't seem to have the time of day for son-in-law). From the moment of the kidnapping, things go wrong and what was supposed to be a non-violent affair turns bloody with more blood added by the minute. Jerry is upset at the bloodshed, which turns loose a pregnant sheriff from Brainerd, MN who is tenacious in attempting to solve the three murders in her jurisdiction. Written by Anonymous 
## 206                                                                                                                                                                                                                                                                                                                                                                           In this movie, Truman is a man whose life is a fake one... The place he lives is in fact a big studio with hidden cameras everywhere, and all his friends and people around him, are actors who play their roles in the most popular TV-series in the world: The Truman Show. Truman thinks that he is an ordinary man with an ordinary life and has no idea about how he is exploited. Until one day... he finds out everything. Will he react? Written by Chris Makrozahopoulos <makzax@hotmail.com> 
##          movie.budget  movie.gross
## 26      $20,000,000,   $57,598,247
## 29      $70,000,000,  $216,119,491
## 31       $7,500,000,    $6,712,241
## 65  JPY 2,400,000,000   $2,298,191
## 97      $35,000,000,   $64,604,977
## 107     $10,000,000,  $138,339,411
## 118        $180,000,      $925,402
## 136                NA   $3,650,677
## 151       $15,000,000  $17,439,163
## 155      $3,500,000,   $16,501,785
## 157      $7,000,000,   $25,882,374
## 206     $60,000,000,  $125,603,360
Click to download the output Movies 1996 - 1998 From IMDB Top 250

Task 2 : Scrape Director, stars, Taglines, Genres, (partial) storyline, Box office budget and box office gross

# Export the combined Top 250 table to CSV without a row-name column.
# FALSE is spelled out because the alias F is an ordinary, reassignable variable.
write.csv(imdbtop250, "IMDB_Top_250.csv", row.names = FALSE)

what we found

# Preview the first six rows of the combined table.
head(imdbtop250, n = 6L)
##                  MovieName ReleaseYear       Movie.Director
## 1 The Shawshank Redemption        1994       Frank Darabont
## 2            The Godfather        1972 Francis Ford Coppola
## 3   The Godfather: Part II        1974 Francis Ford Coppola
## 4          The Dark Knight        2008    Christopher Nolan
## 5             12 Angry Men        1957         Sidney Lumet
## 6         Schindler's List        1993     Steven Spielberg
##                                      Movie.Star
## 1      Tim Robbins, Morgan Freeman, Bob Gunton 
## 2         Marlon Brando, Al Pacino, James Caan 
## 3     Al Pacino, Robert De Niro, Robert Duvall 
## 4  Christian Bale, Heath Ledger, Aaron Eckhart 
## 5      Henry Fonda, Lee J. Cobb, Martin Balsam 
## 6     Liam Neeson, Ralph Fiennes, Ben Kingsley 
##                   Movie.Genre
## 1                 Crime,Drama
## 2                 Crime,Drama
## 3                 Crime,Drama
## 4 Action,Crime,Drama,Thriller
## 5                 Crime,Drama
## 6     Biography,Drama,History
##                                          Movie.Tagline
## 1  Fear can hold you prisoner. Hope can set you free. 
## 2                       The Godfather is now a movie. 
## 3                                                   NA
## 4                   Welcome to a world without rules. 
## 5      ...it explodes like twelve sticks of dynamite! 
## 6     Whoever saves one life, saves the world entire. 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Movie.Storyline
## 1                                                                                                                                                                                                Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red. Written by J-S-Golden 
## 2                                                                                                                                                                                                              When the aging head of a famous crime family decides to transfer his position to one of his subalterns, a series of unfortunate events start happening to the family, and a war begins between all the well-known families leading to insolence, deportation, murder and revenge, and ends with the favorable successor being finally chosen. Written by J. S. Golden 
## 3                                                                                                                                                                                                                                                                                 The continuing saga of the Corleone crime family tells the story of a young Vito Corleone growing up in Sicily and in 1910s New York; and follows Michael Corleone in the 1950s as he attempts to expand the family business into Las Vegas, Hollywood and Cuba. Written by Keith Loh <loh@sfu.ca> 
## 4  Set within a year after the events of Batman Begins, Batman, Lieutenant James Gordon, and new district attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City until a mysterious and sadistic criminal mastermind known only as the Joker appears in Gotham, creating a new wave of chaos. Batman's struggle against the Joker becomes deeply personal, forcing him to "confront everything he believes" and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent and Rachel Dawes. Written by Leon Lombardi 
## 5                                          The defense and the prosecution have rested and the jury is filing into the jury room to decide if a young man is guilty or innocent of murdering his father. What begins as an open-and-shut case of murder soon becomes a detective story that presents a succession of clues creating doubt, and a mini-drama of each of the jurors' prejudices and preconceptions about the trial, the accused, and each other. Based on the play, all of the action takes place on the stage of the jury room. Written by pjk <PETESID@VNET.IBM.COM> 
## 6                                                                                                                                     Oskar Schindler is a vainglorious and greedy German businessman who becomes an unlikely humanitarian amid the barbaric German Nazi reign when he feels compelled to turn his factory into a refuge for Jews. Based on the true story of Oskar Schindler who managed to save about 1100 Jews from being gassed at the Auschwitz concentration camp, it is a testament to the good in all of us. Written by Harald Mayr <marvin@bike.augusta.de> 
##     Movie.Budget  Movie.Gross
## 1    $25,000,000  $28,341,469
## 2   $6,000,000,  $134,821,952
## 3    $13,000,000  $57,300,000
## 4 $185,000,000,  $533,316,061
## 5       $350,000           NA
## 6  $22,000,000,   $96,067,179
##                                                                                                                                                               Movie.Link
## 1 http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
## 2 http://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
## 3 http://www.imdb.com/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
## 4 http://www.imdb.com/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
## 5 http://www.imdb.com/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
## 6 http://www.imdb.com/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=0R7MD7TVA6C9WRZHYWCM&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
Click to download the output IMDB Top 250

Task 3 : Movie vs. Genre

# Build a movie-by-genre indicator table: one row per movie, one column per
# genre. A cell is 1 when the movie carries that genre and stays NA otherwise
# (downstream totals therefore use na.rm = TRUE).
genre <- data.frame(
  Movie = character(),
  Crime = integer(), Drama = integer(), Action = integer(),
  Thriller = integer(), Biography = integer(), History = integer(),
  Adventure = integer(), Fantasy = integer(), Western = integer(),
  SciFi = integer(), Comedy = integer(), Mystery = integer(),
  Family = integer(), War = integer(), Animation = integer(),
  Romance = integer(), Horror = integer(), Music = integer(),
  Musical = integer(), FilmNoir = integer(), Sport = integer(),
  stringsAsFactors = FALSE
)

# Genre columns only (everything except the leading Movie column).
genre.columns <- colnames(genre)[-1]

# Iterate over the movies actually scraped instead of a hard-coded 1:250.
for (r in seq_along(movie.name)) {
  genre[r, "Movie"] <- movie.name[r]

  # Strip whitespace and hyphens so scraped labels line up with the column
  # spelling (the columns are named SciFi and FilmNoir, without hyphens).
  labels <- gsub("\\s|-", "", unlist(movie.genre[r]))

  # Exact name matching. The previous grep() call treated each label as a
  # regular expression, so "Music" also matched the "Musical" column and
  # flagged both. intersect() also drops NA and unknown labels.
  hits <- intersect(labels, genre.columns)
  if (length(hits) > 0) {
    genre[r, hits] <- 1
  }
}

# Export the movie-by-genre table to CSV without a row-name column.
# FALSE is spelled out because the alias F is an ordinary, reassignable variable.
write.csv(genre, "genre.csv", row.names = FALSE)

what we found

Click to download the output Genre
# Preview the first six rows of the movie-by-genre table.
head(genre, n = 6L)
##                      Movie Crime Drama Action Thriller Biography History
## 1 The Shawshank Redemption     1     1     NA       NA        NA      NA
## 2            The Godfather     1     1     NA       NA        NA      NA
## 3   The Godfather: Part II     1     1     NA       NA        NA      NA
## 4          The Dark Knight     1     1      1        1        NA      NA
## 5             12 Angry Men     1     1     NA       NA        NA      NA
## 6         Schindler's List    NA     1     NA       NA         1       1
##   Adventure Fantasy Western SciFi Comedy Mystery Family War Animation
## 1        NA      NA      NA    NA     NA      NA     NA  NA        NA
## 2        NA      NA      NA    NA     NA      NA     NA  NA        NA
## 3        NA      NA      NA    NA     NA      NA     NA  NA        NA
## 4        NA      NA      NA    NA     NA      NA     NA  NA        NA
## 5        NA      NA      NA    NA     NA      NA     NA  NA        NA
## 6        NA      NA      NA    NA     NA      NA     NA  NA        NA
##   Romance Horror Music Musical FilmNoir Sport
## 1      NA     NA    NA      NA       NA    NA
## 2      NA     NA    NA      NA       NA    NA
## 3      NA     NA    NA      NA       NA    NA
## 4      NA     NA    NA      NA       NA    NA
## 5      NA     NA    NA      NA       NA    NA
## 6      NA     NA    NA      NA       NA    NA

Hypothesis

# Count movies per genre: sum every column except the leading Movie column,
# ignoring the NA cells that mark "genre not present".
colSums(genre[, -1L], na.rm = TRUE)
##     Crime     Drama    Action  Thriller Biography   History Adventure 
##        56       176        36        63        26        17        62 
##   Fantasy   Western     SciFi    Comedy   Mystery    Family       War 
##        33        10        31        42        36        24        30 
## Animation   Romance    Horror     Music   Musical  FilmNoir     Sport 
##        20        24         5         2         7         7         7

what we found

Drama, Thriller, Adventure, and Crime are the most common genres on the IMDB Top 250 list. Music and Horror are the least common genres on the list.

Lessons learnt

  1. HTML tags can differ for the same selector. We faced this while scraping movie gross and budget. >> We addressed this by selecting the containing
    element (one level up) and then properly cleaning the data.
  2. Web scraping all 250 movie pages takes a good amount of time. A good internet connection is needed; otherwise we may get a timeout error while reading the data. >> We fixed this by introducing a try-catch block that keeps retrying every 5 seconds on failure.
  3. Substantial cleaning of the overall dataset is required after web scraping.