---
title: "IMDB_Analysis Group Assignment - Jarvis Team"
output: html_document
# SNO | STUDENT NAME | STUDENT ID | STUDENT CONTACT_ID
author:
  - "1 Sai Ravi Kumar Rayapudi 71620061 sai_rayapudi_2016@cba.isb.edu"
  - "2 Sree Kashyap Addanki 71620072 sreekashyap_addanki_2016@cba.isb.edu"
  - "3 Radhika Palisetti 71620054 Radhika_Palisetti_2016@cba.isb.edu"
  - "4 Bharat Kumar Bathula 71620014 bharat_bathula_2016@cba.isb.edu"
---
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Scrape the IMDB Top 250 chart page and extract, for every listed movie,
# its title, detail-page link, cast/crew tooltip, release year, number of
# votes and rating. The results are left in the top-level vectors
# movie.name, movie.link, movie.cast, year, votes and rating.
library("rvest")
library("XML")
library("xml2")

url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)
page

# One <a> node per movie inside the title column.
movie.nodes <- html_nodes(page, ".titleColumn a")

### Check one node
# NOTE(review): xmlTreeParse() comes from the XML package while read_html()
# is used together with xml2 here -- confirm this inspection call still works
# with the installed rvest version.
xmlTreeParse(movie.nodes[[1]])

# Pull the href / title attribute of every node; vapply() guarantees a
# character vector with exactly one value per movie.
movie.link <- vapply(html_attrs(movie.nodes), `[[`, character(1), "href")
movie.link <- paste0("http://www.imdb.com", movie.link)
movie.cast <- vapply(html_attrs(movie.nodes), `[[`, character(1), "title")
movie.cast
movie.link
movie.name <- html_text(movie.nodes)
movie.name

# The year is rendered as "(1994)"; strip both parentheses.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))
year

rating.nodes <- html_nodes(page, ".imdbRating")
rating.nodes
### Check one node
xmlTreeParse(rating.nodes[[1]])
### Correct the node: the rating value and its tooltip live in <strong>
rating.nodes <- html_nodes(page, ".imdbRating strong")
rating.nodes

# The tooltip reads "<rating> based on <n,nnn> user ratings"; keep only the
# vote count and drop the thousands separators before converting.
votes <- as.numeric(
  gsub(",", "",
    gsub(" user ratings", "",
      gsub(".*?based on ", "",
        vapply(html_attrs(rating.nodes), `[[`, character(1), "title")
      )
    )
  )
)
votes
rating <- as.numeric(html_text(rating.nodes))
rating
# Combine the scraped chart columns into one data frame (one row per movie)
# and persist it as a CSV without the row-name column.
top250 <- data.frame(movie.name, movie.cast, movie.link, year, votes, rating)
head(top250)
# FALSE instead of F: F is an ordinary variable that can be reassigned.
write.csv(top250, 'IMDB Top 250.csv', row.names = FALSE)
library(dplyr)
library(stringr)

# Number of movies to process. Taken from the scraped movie.name vector;
# `data` is only read from disk later in the script, so it cannot be used here.
n_movies <- length(movie.name)
n_movies

# Result vectors filled by the per-movie scraping loop, one slot per movie.
# Preallocated with NA so that a field missing on a page stays NA instead of
# leaving the vector shorter than the others.
moviename <- rep(NA_character_, n_movies)
Directors <- rep(NA_character_, n_movies)
Stars <- rep(NA_character_, n_movies)
TagLines <- rep(NA_character_, n_movies)
genres <- rep(NA_character_, n_movies)
StoryLine <- rep(NA_character_, n_movies)
BoxOfficeBudget <- rep(NA_character_, n_movies)
BoxOfficeGross <- rep(NA_character_, n_movies)
### Visit the detail page of every movie in movie.link (250 for the full
### chart) and scrape director, stars, tagline, genres, story line, budget
### and gross into the result vectors, one element per movie.
library("rvest")
library("XML")
library("xml2")
library(stringr)

# Trimmed text of the first node matching `css` on `doc`, or NA when the
# selector matches nothing (so a missing field cannot abort the loop).
first_node_text <- function(doc, css) {
  found <- doc %>% html_nodes(css) %>% html_text(trim = TRUE)
  if (length(found) == 0) NA_character_ else found[[1]]
}

# Remove the literal text `label` from `x` and trim surrounding whitespace.
# fixed = TRUE matters: a pattern such as "[Budget:]" is a bracket expression
# that deletes every B, u, d, g, e, t and ':' character individually.
strip_label <- function(x, label) {
  trimws(gsub(label, "", x, fixed = TRUE), which = "both")
}

for (i in seq_along(movie.link)) {
  moviename[i] <- movie.name[i]
  page <- read_html(movie.link[i])

  Directors[i] <- first_node_text(page, '.summary_text+ .credit_summary_item ')
  Stars[i] <- first_node_text(
    page,
    '.credit_summary_item~ .credit_summary_item+ .credit_summary_item'
  )

  # gsub() propagates NA, so a page without a tagline yields NA here.
  Tag <- first_node_text(page, '#titleStoryLine .txt-block:nth-child(8)')
  TagLines[i] <- gsub("\n", "", gsub("Taglines:", "", Tag, fixed = TRUE))

  # Drop the "Genres:" label and the '|' separators; the genres are laid out
  # on separate lines, so newlines become the comma delimiter.
  genre_text <- first_node_text(page, '.see-more.canwrap~ .canwrap')
  genres[i] <- gsub(
    "\n", ",",
    gsub("[|]", "", gsub("Genres:", "", genre_text, fixed = TRUE))
  )

  # Turn the trailing "Written by<newline>author" credit into an inline tag.
  story <- first_node_text(page, "#titleStoryLine p")
  StoryLine[i] <- gsub(
    "\n", "",
    gsub("Written by\n", "[Written by] ", story, fixed = TRUE)
  )

  budget_text <- first_node_text(page, '.txt-block:nth-child(11)')
  BoxOfficeBudget[i] <- strip_label(budget_text, "Budget:")

  # paste(collapse = " ") always returns exactly one string ("" when nothing
  # matched), so test for an empty string rather than for length zero.
  gross_text <- page %>%
    html_nodes('#titleDetails .txt-block:nth-child(12)') %>%
    html_text(trim = TRUE) %>%
    paste(collapse = " ")
  if (nzchar(gross_text)) {
    BoxOfficeGross[i] <- strip_label(gross_text, "Gross:")
  } else {
    BoxOfficeGross[i] <- NA
  }
}
# Full per-movie table using the scraping vectors under their original names.
Moviedata <- data.frame(
  moviename, Directors, Stars, TagLines, genres, StoryLine,
  BoxOfficeBudget, BoxOfficeGross, movie.cast, movie.link, year, votes, rating
)

# Same data with the lower-case column names that the analysis steps read
# back from the CSV (e.g. boxOfficeGross, genres). The columns are mapped
# explicitly because no objects named director, stars, taglines, ... exist.
movieInfo1 <- data.frame(
  moviename,
  director = Directors,
  stars = Stars,
  taglines = TagLines,
  genres,
  storyline = StoryLine,
  boxOfficeBudget = BoxOfficeBudget,
  boxOfficeGross = BoxOfficeGross,
  movie.cast, movie.link, year, votes, rating
)
write.csv(movieInfo1, 'IMDBTop250.movieInfo3.csv', row.names = FALSE)

# Read the file back only after it has been written, so `data` reflects the
# current scrape and the script also works on a first run.
data <- read.csv("IMDBTop250.movieInfo3.csv")
data
# Load the saved movie table (trimming blanks around unquoted fields),
# strip all whitespace from the genre lists and the dollar signs / thousands
# separators from the gross figures, then save the cleaned table.
imdb <- read.csv(file = "IMDBTop250.movieInfo3.csv", strip.white = TRUE)
imdb[["genres"]] <- gsub(
  pattern = "[[:space:]]",
  replacement = "",
  x = imdb[["genres"]]
)
imdb[["boxOfficeGross"]] <- gsub(
  pattern = "\\$|,",
  replacement = "",
  x = imdb[["boxOfficeGross"]]
)
write.csv(x = imdb, file = "IMDB_final.csv")
# Expand the comma-separated genre list so that each (movie, genre) pair gets
# its own row, then count how many movies fall under each genre.
# separate_rows() lives in tidyr, which this script never attaches, so the
# call is namespace-qualified.
imdb_split <- tidyr::separate_rows(imdb, genres, sep = ",")
genre_count <- table(imdb_split$genres)
genre_count
Final_genre_table <- as.data.frame(genre_count)
Final_genre_table

# Quick overview of the cleaned data set.
summary(imdb)
str(imdb)
library(ggplot2)
library(plyr)

# Decade of release, e.g. 1994 -> 1990. Computed arithmetically so that every
# decade gets its own label; the previous ifelse() ladder labelled every film
# released before 1970 as 1960 and returned NA from 2020 onwards.
imdb["decade"] <- floor(as.numeric(imdb$year) / 10) * 10

# Number of movies per decade. The string-column form of count() is plyr's,
# so call it explicitly instead of relying on plyr masking dplyr.
freq <- plyr::count(imdb, "decade")

# Histogram with one 10-year-wide bar per decade (qplot() is deprecated).
ggplot(imdb, aes(x = decade)) +
  geom_histogram(binwidth = 10) +
  labs(x = "decade", y = "Frequency of movies")
# Total box-office gross per decade and per year. boxOfficeGross is stored as
# text, so it is converted to numeric first; values that cannot be parsed
# become NA and are dropped by aggregate()'s default na.action.
gross_by <- data.frame(
  decade = imdb$decade,
  year = imdb$year,
  collection = as.numeric(imdb$boxOfficeGross)
)

# The formula interface names the result columns 'decade' / 'collection'
# directly, so the plots below get readable axis labels.
gross_collections_per_decade <- aggregate(
  collection ~ decade, data = gross_by, FUN = sum
)
plot(gross_collections_per_decade)

# Bar chart of the same totals. qqplot() is not used here because it sorts
# each vector independently, which breaks the pairing between a decade and
# its collection.
barplot(
  gross_collections_per_decade$collection,
  names.arg = gross_collections_per_decade$decade,
  xlab = "decade",
  ylab = "collection"
)

gross_collections_per_year <- aggregate(
  collection ~ year, data = gross_by, FUN = sum
)