---
Title: "IMDB_Analysis Group Assignment - Jarvis Team"
Output: html_document
 
SNO     STUDENT NAME                  STUDENT ID    STUDENT CONTACT_ID
1         Sai Ravi Kumar Rayapudi       71620061      sai_rayapudi_2016@cba.isb.edu 
2         Sree Kashyap Addanki        71620072      sreekashyap_addanki_2016@cba.isb.edu 
3         Radhika Palisetti           71620054      Radhika_Palisetti_2016@cba.isb.edu  
4         Bharat Kumar Bathula        71620014      bharat_bathula_2016@cba.isb.edu
---

R Markdown

Explanation About R Markdown:

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

IMDB_Analysis Group Assignment - Code Starts From Here

Libraries Required for our Code & Analysis are given here below

# Packages used for scraping: rvest (HTML selection), XML (node inspection),
# and xml2 (HTML parsing).
library("rvest")
library("XML")
library("xml2")

The code below reads the IMDB page that lists the top 250 movies and stores the parsed page in a variable.

# Download the IMDB Top 250 chart page and select the anchor node of every
# listed title.
url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)
page
movie.nodes <- html_nodes(page, ".titleColumn a")

Now movie.nodes contains list of top rated 250 movies in the IMDB website

Check one node

# Print the parsed tree of the first title node to inspect which attributes
# and text it carries.
# NOTE(review): xmlTreeParse() comes from the XML package while movie.nodes is
# produced by rvest; confirm it accepts this node type with the installed
# rvest version.
xmlTreeParse(movie.nodes[[1]])

We get information about the below 3 variables movie.link, movie.cast, movie.name

movie.link contains the full URL of each movie's page, built from the href attribute of every node; we inspected the first node above to find out which attributes are available.

# Extract the link, cast, and name of every title from its anchor node.
# vapply() guarantees a character vector with one entry per node.
movie.link <- vapply(html_attrs(movie.nodes), `[[`, character(1), "href")
# The href values are site-relative, so prefix the host to get full URLs.
movie.link <- paste0("http://www.imdb.com", movie.link)
movie.cast <- vapply(html_attrs(movie.nodes), `[[`, character(1), "title")
movie.cast
movie.link
movie.name <- html_text(movie.nodes)
movie.name

Here we clean the release-year data by removing the surrounding parentheses.

# Read the text of every release-year node and strip both parentheses in a
# single pass; the result stays a character vector.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))
year

Here rating.nodes will now contain 250 ratings of the above movies

# Select the rating cell of every chart row.
rating.nodes <- html_nodes(page, ".imdbRating")
rating.nodes
# Check one node
xmlTreeParse(rating.nodes[[1]])
# Correct the node: narrow the selection to the <strong> element nested in
# each rating cell, which is what the votes and rating are read from.
rating.nodes <- html_nodes(page, ".imdbRating strong")
rating.nodes

Here votes will now contain Votes of the above movies

# Votes are read from each rating node's title attribute. Judging by the
# patterns removed below, that text looks like
# "<rating> based on 1,234,567 user ratings".
rating.titles <- vapply(html_attrs(rating.nodes), `[[`, character(1), "title")
votes <- gsub(".*?based on ", "", rating.titles)
votes <- gsub(" user ratings", "", votes)
# Drop the thousands separators before converting to a number.
votes <- as.numeric(gsub(",", "", votes))
votes
rating <- as.numeric(html_text(rating.nodes))
rating

Here top250 will now contain all the required information in a data frame

# Combine the chart-level vectors into one data frame, one row per movie, and
# preview the first rows.
top250 <- data.frame(movie.name, movie.cast, movie.link, year, votes, rating)
head(top250)

Here all the required information is written in CSV form to the file “IMDB Top 250.csv” in the working directory.

# Save the chart-level table without the row-number column.
write.csv(top250, "IMDB Top 250.csv", row.names = FALSE)

# Data-manipulation and string helpers used by the remaining sections.
library(dplyr)
library(stringr)

create objects for the below variables, so that we can use them in the code

# Scrape per-movie details ----
# For every movie collected from the chart page, open its own IMDB page and
# pull the director, stars, tagline, genres, storyline, budget, and gross.
# The results are combined with the chart-level columns (cast, link, year,
# votes, rating) into movieInfo1 and written to disk.

# First match of a selector, or NA when the selector matched nothing, so each
# movie always contributes exactly one value per column.
first_or_na <- function(x) {
  if (length(x) > 0) x[[1]] else NA_character_
}

# Text of every node matching `css`, collapsed into one string. Yields ""
# when nothing matches, which strip_label() uses to detect a missing block.
collapsed_text <- function(page, css, sep, trim = TRUE) {
  page %>%
    html_nodes(css) %>%
    html_text(trim = trim) %>%
    paste(collapse = sep)
}

# Remove a label such as "Budget:" from a scraped block and trim it; an empty
# block is recorded as NA.
strip_label <- function(text, label) {
  if (!nzchar(text)) {
    return(NA_character_)
  }
  trimws(gsub(label, "", text), which = "both")
}

# Iterate over the movies actually scraped rather than assuming exactly 250.
n_movies <- length(movie.link)
n_movies

# Preallocate one slot per movie; fields that may be absent default to NA.
moviename <- movie.name
director <- character(n_movies)
stars <- character(n_movies)
taglines <- rep(NA_character_, n_movies)
genres <- rep(NA_character_, n_movies)
storyline <- rep(NA_character_, n_movies)
boxOfficeBudget <- rep(NA_character_, n_movies)
boxOfficeGross <- rep(NA_character_, n_movies)

for (i in seq_len(n_movies)) {
  # `page` is re-pointed at the current movie's page on every iteration.
  page <- read_html(movie.link[i])

  # Director information
  director[i] <- collapsed_text(
    page, ".summary_text+ .credit_summary_item ",
    sep = " ,"
  )

  # Stars data (text is not trimmed here)
  stars[i] <- collapsed_text(
    page,
    ".credit_summary_item~ .credit_summary_item+ .credit_summary_item",
    sep = " ,", trim = FALSE
  )

  # Tagline: stays NA when the page has no matching block.
  # NOTE(review): the outer pattern was empty in the garbled source; "\n" is
  # restored here to match the genres/storyline cleanup - confirm.
  tl <- page %>%
    html_nodes("#titleStoryLine .txt-block:nth-child(8)") %>%
    html_text(trim = TRUE)
  taglines[i] <- gsub("\n", "", gsub("Taglines:", "", first_or_na(tl)))

  # Genres: drop the label and newlines, turn "|" separators into commas.
  gs1 <- page %>%
    html_nodes(".see-more.canwrap~ .canwrap") %>%
    html_text(trim = TRUE)
  genres[i] <- gsub(
    "[|]", ",",
    gsub("\n", "", gsub("Genres:", "", first_or_na(gs1)))
  )

  # Storyline: keep "Written by" readable, then remove remaining newlines.
  sLine1 <- page %>%
    html_nodes("#titleStoryLine p") %>%
    html_text(trim = TRUE)
  storyline[i] <- gsub(
    "\n", "",
    gsub("Written by\n", "Written by ", first_or_na(sLine1))
  )

  # Budget and gross: test the text scraped for THIS movie; NA when absent.
  boxOfficeBudget1 <- collapsed_text(
    page, ".txt-block:nth-child(11)",
    sep = " "
  )
  boxOfficeBudget[i] <- strip_label(boxOfficeBudget1, "Budget:")

  boxOfficeGross1 <- collapsed_text(
    page, "#titleDetails .txt-block:nth-child(12)",
    sep = " "
  )
  boxOfficeGross[i] <- strip_label(boxOfficeGross1, "Gross:")
}

# movieInfo1 is the data frame that contains all the related movie information
movieInfo1 <- data.frame(
  moviename, director, stars, taglines, genres, storyline,
  boxOfficeBudget, boxOfficeGross, movie.cast, movie.link, year, votes, rating
)
movieInfo1
write.csv(movieInfo1, "IMDBTop250.movieInfo3.csv", row.names = FALSE)

Task 1: Find all movies that were released between 1996 and 1998 (both years inclusive).

# Task 1: list the movies released from 1996 through 1998 (inclusive).
# Install dplyr only when it is missing; the package name must be quoted.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

data <- read.csv("IMDBTop250.movieInfo3.csv")
# Columns are referenced by bare name inside filter(); the namespace prefix
# avoids accidentally calling stats::filter().
dplyr::filter(data, year >= 1996 & year <= 1998)
data

Task2:Read in the URL’s imdb page and scrape the following information:

Director, stars, Taglines, Genres, (partial) storyline, Box office budget and box office gross.

# Task 2: load the scraped per-movie details back from disk and display them.
data <- read.csv("IMDBTop250.movieInfo3.csv")
data

Task3:Make a dataframe out of these variables as columns with movie name being the first variable.

# Task 3: assemble the scraped vectors into one data frame with the movie name
# as the first column, then save it without the row-number column.
movieInfo1 <- data.frame(
  moviename, director, stars, taglines, genres, storyline,
  boxOfficeBudget, boxOfficeGross, movie.cast, movie.link, year, votes, rating
)
write.csv(movieInfo1, "IMDBTop250.movieInfo3.csv", row.names = FALSE)

Task4. Make a table movie-count versus Genres.

# Task 4: count how many movies fall under each genre.
imdb <- read.csv("IMDBTop250.movieInfo3.csv", strip.white = TRUE)
# Remove all whitespace so that splitting on "," yields clean genre labels.
imdb$genres <- gsub("[[:space:]]", "", imdb$genres)
# Strip the dollar sign and thousands separators from the gross figures.
imdb$boxOfficeGross <- gsub("\\$|,", "", imdb$boxOfficeGross)
write.csv(imdb, file = "IMDB_final.csv")
# One row per (movie, genre) pair. separate_rows() lives in tidyr, which this
# document never attaches, so it is called through its namespace.
imdb_split <- tidyr::separate_rows(imdb, genres, sep = ",")
genre_count <- table(imdb_split$genres)
genre_count
Final_genre_table <- as.data.frame(genre_count)
Final_genre_table

Task5.Write a markdown doc with your code and explanation. See if you can storify your hypotheses.

summary of imdb

# Summary of imdb
summary(imdb)
# Structure of imdb
str(imdb)
library(ggplot2)
# plyr is attached after dplyr, so plyr's versions of shared names such as
# count() take precedence from here on.
library(plyr)

Created 6 decade bins for movies.

All movies released before 1960 are placed in the 1960 bin.

# Assign every movie to a decade bin (1960, 1970, ..., 2010), stored as a
# number. Years before 1970 share the 1960 bin; years from 2020 on get NA.
decade <- pmax(floor(imdb$year / 10) * 10, 1960)
# which() skips rows whose year is NA instead of indexing with NA.
decade[which(imdb$year >= 2020)] <- NA
imdb["decade"] <- decade

# Movies per decade. plyr's count() takes the column name as a string; the
# namespace prefix keeps this working regardless of package attach order.
freq <- plyr::count(imdb, "decade")
# We have observed that many top movies are released before 1970
qplot(
  imdb$decade,
  geom = "histogram", xlab = "decade", ylab = "Frequency of movies"
)

We observed that the gross collection for movies decreased slightly in the 1970s compared with previous years. After that, there is a steady increase in gross collection.

# Total box-office gross per decade and per release year.
# boxOfficeGross is converted with as.numeric(); rows where the converted
# value or the grouping value is NA are dropped by aggregate()'s default
# na.action.
gross_values <- data.frame(
  decade = imdb$decade,
  year = imdb$year,
  collection = as.numeric(imdb$boxOfficeGross)
)

gross_collections_per_decade <- aggregate(
  collection ~ decade,
  data = gross_values, FUN = sum
)
plot(gross_collections_per_decade)
names(gross_collections_per_decade) <- c("decade", "collection")

# Plot collection against its own decade. qqplot() would sort both vectors
# independently and so hide any decade in which the total dropped.
plot(
  gross_collections_per_decade$decade,
  gross_collections_per_decade$collection,
  type = "b", xlab = "decade", ylab = "collection"
)

gross_collections_per_year <- aggregate(
  collection ~ year,
  data = gross_values, FUN = sum
)
names(gross_collections_per_year) <- c("year", "collection")

The sections above serve only as the Markdown reference. Our main hypothesis and its supporting graphs, along with the explanation, are given in a separate Word document; kindly refer to it. With regards - Jarvis Team.