---
title: "IMDB_Analysis Group Assignment - Jarvis Team"
output: html_document
author:
  - "1. Sai Ravi Kumar Rayapudi | Student ID 71620061 | sai_rayapudi_2016@cba.isb.edu"
  - "2. Sree Kashyap Addanki | Student ID 71620072 | sreekashyap_addanki_2016@cba.isb.edu"
  - "3. Radhika Palisetti | Student ID 71620054 | Radhika_Palisetti_2016@cba.isb.edu"
  - "4. Bharat Kumar Bathula | Student ID 71620014 | bharat_bathula_2016@cba.isb.edu"
---

R Markdown

Explanation About R Markdown:

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

IMDB_Analysis Group Assignment - Code Starts From Here

The libraries required for our code and analysis are loaded below.

# Packages used for scraping: rvest (HTML nodes/text), XML (tree inspection), xml2 (parsing).
library("rvest")
library("XML")
library("xml2")

The code below reads the IMDB Top 250 chart page from its URL and stores the parsed page in a variable.

# Download and parse the chart page, then select the title links.
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)
page
# One <a> node per movie, taken from inside the '.titleColumn' cells.
movie.nodes <- html_nodes(page, ".titleColumn a")

Now movie.nodes contains the list of the 250 top-rated movies on the IMDB website.

Check one node

# Print the parsed tree of the first movie node to see what it carries.
xmlTreeParse(movie.nodes[[1]])

We get information about the below 3 variables movie.link, movie.cast, movie.name

movie.link holds the full URL of each movie's page: it is built from the href attribute of every node, prefixed with the IMDB site address.

# Pull the 'href' and 'title' attributes out of every movie node.
# vapply() pins the result to one string per node, so a node without the
# attribute fails loudly instead of silently changing the result type.
movie.link <- vapply(html_attrs(movie.nodes), `[[`, character(1), "href")
# The hrefs are site-relative; prefix the host to get full URLs.
movie.link <- paste0("http://www.imdb.com", movie.link)
movie.cast <- vapply(html_attrs(movie.nodes), `[[`, character(1), "title")
movie.cast
movie.link
# The visible link text is the movie name.
movie.name <- html_text(movie.nodes)
movie.name

Here we clean the release-year text by removing the surrounding parentheses.

# Release year of each movie: take the text of the '.secondaryInfo' nodes
# and strip both the opening and the closing parenthesis.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))
year

Here rating.nodes will now contain 250 ratings of the above movies

# First attempt: select the whole rating cell of every movie.
rating.nodes <- html_nodes(page, ".imdbRating")
rating.nodes
### Check one node
xmlTreeParse(rating.nodes[[1]])
### Correct the node: narrow the selection to the <strong> element inside the cell
rating.nodes <- html_nodes(page, ".imdbRating strong")
rating.nodes

Here votes will now contain Votes of the above movies

# Vote counts are read from the 'title' attribute of each rating node:
# drop everything up to "based on ", drop the " user ratings" suffix,
# remove the thousands separators, then convert to a number.
rating_titles <- vapply(html_attrs(rating.nodes), `[[`, character(1), "title")
vote_text <- gsub(".*?based on ", "", rating_titles)
vote_text <- gsub(" user ratings", "", vote_text)
votes <- as.numeric(gsub(",", "", vote_text))
votes
# The rating itself is the visible text of the node.
rating <- as.numeric(html_text(rating.nodes))
rating

Here top250 will now contain all the required information in a data frame

               # Assemble the scraped vectors into one table, one row per movie,
               # and preview its first rows.
               top250 <- data.frame(
                 movie.name,
                 movie.cast,
                 movie.link,
                 year,
                 votes,
                 rating
               )
               head(top250)

Here all the required information is written to the file “IMDB Top 250.csv” in CSV form.

               # Save the table; row.names = FALSE keeps the row index out of the file.
               # FALSE is spelled out because the shorthand F is an ordinary, reassignable variable.
               write.csv(top250, "IMDB Top 250.csv", row.names = FALSE)
               
               
               library (dplyr)
               library(stringr)

Create empty objects for the variables below so that the scraping loop can fill them in.

               # Scrape the per-movie details (director, stars, tagline, genres,
               # storyline, budget, gross) from every title page in movie.link and
               # combine them with the chart-level columns into Moviedata.
               library("rvest")
               library("XML")
               library("xml2")

               # Text of the nodes matching `css` on `page`, or NA when nothing matches.
               # With collapse = NULL only the first match is returned; otherwise all
               # matches are pasted together with `collapse` as the separator.
               scrape_text <- function(page, css, collapse = NULL) {
                 txt <- html_text(html_nodes(page, css), trim = TRUE)
                 if (length(txt) == 0) {
                   return(NA_character_)
                 }
                 if (is.null(collapse)) {
                   txt[[1]]
                 } else {
                   paste(txt, collapse = collapse)
                 }
               }

               # One slot per movie, pre-filled with NA, so a field missing from a page
               # leaves a gap instead of shortening (and misaligning) its column.
               n_movies <- length(movie.link)
               moviename <- movie.name
               Directors <- rep(NA_character_, n_movies)
               Stars <- rep(NA_character_, n_movies)
               TagLines <- rep(NA_character_, n_movies)
               genres <- rep(NA_character_, n_movies)
               StoryLine <- rep(NA_character_, n_movies)
               BoxOfficeBudget <- rep(NA_character_, n_movies)
               BoxOfficeGross <- rep(NA_character_, n_movies)
               length(movie.name)

               # Visit every movie link. Iterating over all of them matters: a shorter
               # loop would let data.frame() recycle the few scraped rows across the
               # whole chart without any error.
               for (i in seq_len(n_movies)) {
                 # page holds the title page of movie i; all fields below come from it.
                 page <- tryCatch(
                   read_html(movie.link[i]),
                   error = function(e) {
                     warning("Could not read ", movie.link[i], ": ",
                             conditionMessage(e), call. = FALSE)
                     NULL
                   }
                 )
                 if (is.null(page)) {
                   next  # keep the NA placeholders for this movie
                 }

                 # Director information.
                 Directors[i] <- scrape_text(
                   page, ".summary_text+ .credit_summary_item "
                 )

                 # Stars data.
                 Stars[i] <- scrape_text(
                   page,
                   ".credit_summary_item~ .credit_summary_item+ .credit_summary_item"
                 )

                 # Tagline: drop the label and any line breaks (NA stays NA).
                 tag <- scrape_text(page, "#titleStoryLine .txt-block:nth-child(8)")
                 TagLines[i] <- gsub("\n", "", gsub("Taglines:", "", tag))

                 # Genres: remove the literal "Genres:" label (fixed = TRUE, so it is
                 # not read as a character class that would delete single letters),
                 # drop the "|" separators and turn line breaks into commas.
                 genre_text <- scrape_text(page, ".see-more.canwrap~ .canwrap")
                 genre_text <- trimws(gsub("Genres:", "", genre_text, fixed = TRUE))
                 genres[i] <- gsub("\n", ",", gsub("[|]", "", genre_text))

                 # Storyline: mark the author credit, then flatten line breaks.
                 story <- scrape_text(page, "#titleStoryLine p")
                 story <- gsub("Written by\n", "[Written by] ", story, fixed = TRUE)
                 StoryLine[i] <- gsub("\n", "", story)

                 # Budget: strip the literal label and surrounding whitespace.
                 budget <- scrape_text(page, ".txt-block:nth-child(11)")
                 BoxOfficeBudget[i] <- trimws(
                   gsub("Budget:", "", budget, fixed = TRUE),
                   which = "both"
                 )

                 # Gross: all matching blocks joined by a space; NA when none match.
                 gross <- scrape_text(
                   page, "#titleDetails .txt-block:nth-child(12)", collapse = " "
                 )
                 BoxOfficeGross[i] <- trimws(
                   gsub("Gross:", "", gross, fixed = TRUE),
                   which = "both"
                 )
               }
               Moviedata = data.frame(moviename, Directors , Stars , TagLines , genres , StoryLine , BoxOfficeBudget  ,BoxOfficeGross, movie.cast, movie.link,year,votes,rating)

Moviedata is the data frame that contains all the related movie information.

               # Show the combined table, then save it for the tasks below.
               Moviedata

               # FALSE is spelled out because the shorthand F is an ordinary, reassignable variable.
               write.csv(Moviedata, "IMDBTop250.movieInfo3.csv", row.names = FALSE)

Task1: Find all movies that released between 1996 and 1998 (both years including).

               # dplyr must already be installed; install it once interactively with
               # install.packages("dplyr") rather than on every run of the report.
               library(dplyr)
               data <- read.csv("IMDBTop250.movieInfo3.csv")
               # Keep the filtered rows: filter() returns a new data frame and leaves
               # `data` untouched, so the result has to be stored to be shown.
               movies_1996_1998 <- filter(data, year >= 1996 & year <= 1998)
               movies_1996_1998

Task2:Read in the URL’s imdb page and scrape the following information:

Director, stars, Taglines, Genres, (partial) storyline, Box office budget and box office gross.

               # Reload the saved movie details and display them.
               data <- read.csv(file = "IMDBTop250.movieInfo3.csv")
               data

Task3:Make a dataframe out of these variables as columns with movie name being the first variable.

               # Build the final table with the movie name first. The scraped vectors
               # are named Directors, Stars, TagLines, StoryLine, BoxOfficeBudget and
               # BoxOfficeGross; they are given the lower-case column names that the
               # later analysis reads (e.g. imdb$boxOfficeGross).
               movieInfo1 <- data.frame(
                 moviename,
                 director = Directors,
                 stars = Stars,
                 taglines = TagLines,
                 genres,
                 storyline = StoryLine,
                 boxOfficeBudget = BoxOfficeBudget,
                 boxOfficeGross = BoxOfficeGross,
                 movie.cast,
                 movie.link,
                 year,
                 votes,
                 rating
               )
               write.csv(movieInfo1, "IMDBTop250.movieInfo3.csv", row.names = FALSE)

Task4. Make a table movie-count versus Genres.

               # Count how many movies fall under each genre.
               imdb <- read.csv("IMDBTop250.movieInfo3.csv", strip.white = TRUE)
               # Remove all whitespace so the genre names split cleanly on commas.
               imdb$genres <- gsub("[[:space:]]", "", imdb$genres)
               # Strip "$" and thousands separators from the gross figures.
               imdb$boxOfficeGross <- gsub("\\$|,", "", imdb$boxOfficeGross)
               write.csv(imdb, file = "IMDB_final.csv")

               # One row per (movie, genre) pair, built with base R so that no package
               # beyond those already loaded is needed.
               genre_list <- strsplit(imdb$genres, ",", fixed = TRUE)
               imdb_split <- imdb[rep(seq_len(nrow(imdb)), lengths(genre_list)), , drop = FALSE]
               imdb_split$genres <- unlist(genre_list)
               # Drop missing and empty tokens (e.g. from a leading or doubled comma).
               keep <- !is.na(imdb_split$genres) & nzchar(imdb_split$genres)
               imdb_split <- imdb_split[keep, , drop = FALSE]

               genre_count <- table(imdb_split$genres)
               genre_count
               Final_genre_table <- as.data.frame(genre_count)
               Final_genre_table

Task5.Write a markdown doc with your code and explanation. See if you can storify your hypotheses.

summary of imdb

               # Per-column summary of the imdb data frame.
               summary(imdb)

structure of imdb

               # Column types and a preview of the values in imdb.
               str(imdb)
               library(ggplot2)
               # NOTE(review): plyr is attached after dplyr, so same-named functions such
               # as count() presumably resolve to plyr from here on — confirm this is intended.
               library(plyr)

Created 6 decade bins for movies.

All the movies whose release year is before 1970 are placed in the 1960 bin.

               # Bin each release year into its decade via an interval lookup:
               # everything before 1970 shares the 1960 bin, and years from 2020
               # onward (or missing years) get NA.
               imdb["decade"] <- c(1960, 1970, 1980, 1990, 2000, 2010, NA)[
                 findInterval(imdb$year, c(-Inf, 1970, 1980, 1990, 2000, 2010, 2020))
               ]

`imdb["decade"] <- ifelse(imdb$year < 1970, '1960', ifelse(imdb$year < 1980, '1970', ifelse(imdb$year < 1990, '1980', ifelse(imdb$year < 2000, '1990', ifelse(imdb$year < 2010, '2000', ifelse(imdb$year < 2020, '2010', NA))))))`

               # Number of movies per decade. The column is passed by name as a string,
               # which is plyr's count() interface; the namespace is spelled out so the
               # call does not depend on plyr having been attached after dplyr.
               freq <- plyr::count(imdb, "decade")

We have observed that many top movies are released before 1970

               # Histogram of how many movies fall in each decade bin.
               qplot(
                 x = imdb$decade,
                 geom = "histogram",
                 ylab = "Frequency of movies",
                 xlab = "decade"
               )

We have observed that the gross collection for movies decreased slightly in the 1970s compared with previous years. There is a steady increase in gross collection after this.

               # Total box-office gross per decade and per year.
               # The gross column is text at this point, so convert it once; values that
               # are not plain numbers become NA and are left out of the sums by
               # aggregate()'s default NA handling.
               gross_values <- data.frame(
                 decade = imdb$decade,
                 year = imdb$year,
                 collection = as.numeric(imdb$boxOfficeGross)
               )

               gross_collections_per_decade <- aggregate(
                 collection ~ decade, data = gross_values, FUN = sum
               )
               plot(gross_collections_per_decade)

               # Collection against decade in decade order. A quantile-quantile plot
               # (qqplot) would sort both vectors independently and always look
               # increasing, hiding any dip between decades.
               plot(
                 collection ~ decade,
                 data = gross_collections_per_decade,
                 type = "b"
               )

               gross_collections_per_year <- aggregate(
                 collection ~ year, data = gross_values, FUN = sum
               )

While the material above is only for Markdown reference, our main hypothesis and its supporting graphs, along with the explanation, are given in a separate Word document. Kindly refer to that. With regards - Jarvis Team.