---
title: "IMDB_Analysis Group Assignment - Jarvis Team"
output: html_document
# Team members: student name (student ID, student contact ID)
author:
  - "Sai Ravi Kumar Rayapudi (71620061, sai_rayapudi_2016@cba.isb.edu)"
  - "Sree Kashyap Addanki (71620072, sreekashyap_addanki_2016@cba.isb.edu)"
  - "Radhika Palisetti (71620054, Radhika_Palisetti_2016@cba.isb.edu)"
  - "Bharat Kumar Bathula (71620014, bharat_bathula_2016@cba.isb.edu)"
---
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the scraping packages and fetch the IMDb Top 250 chart page.
library("rvest")
library("XML")
library("xml2")

url <- "http://www.imdb.com/chart/top?ref_=nv_wl_img_3"
page <- read_html(url)
page

# One <a> node per chart row, selected from inside the .titleColumn cells.
movie.nodes <- html_nodes(page, ".titleColumn a")

# Dump the first node to see which attributes it carries.
xmlTreeParse(movie.nodes[[1]])

# The href attribute is prefixed with the host to build a full URL.
# html_attr() returns one string per node (NA when the attribute is absent),
# so the result is always a character vector of the same length as the nodes.
movie.link <- paste0("http://www.imdb.com", html_attr(movie.nodes, "href"))
# The title attribute of each link is kept as that movie's cast summary.
movie.cast <- html_attr(movie.nodes, "title")
movie.cast
movie.link
movie.name <- html_text(movie.nodes)
movie.name
# Release year of each movie: the text of the .secondaryInfo nodes with the
# surrounding "(" and ")" removed. The result is left as a character vector.
year <- gsub("[()]", "", html_text(html_nodes(page, ".secondaryInfo")))
year
# Rating cells of the chart.
rating.nodes <- html_nodes(page, ".imdbRating")
rating.nodes
### Check one node
xmlTreeParse(rating.nodes[[1]])
### Correct the node: select the <strong> element inside each rating cell,
### whose text is the rating and whose title attribute mentions the votes.
rating.nodes <- html_nodes(page, ".imdbRating strong")
rating.nodes

# Vote count: take the title attribute, drop everything up to "based on ",
# drop the trailing " user ratings", then remove thousands separators.
rating.title <- html_attr(rating.nodes, "title")
votes <- gsub(".*?based on ", "", rating.title)
votes <- gsub(" user ratings", "", votes)
votes <- as.numeric(gsub(",", "", votes))
votes

rating <- as.numeric(html_text(rating.nodes))
rating
# Assemble the chart-level columns into one data frame, preview it, and save
# it to disk without the row-name column.
top250 <- data.frame(movie.name, movie.cast, movie.link, year, votes, rating)
head(top250)

write.csv(top250, "IMDB Top 250.csv", row.names = FALSE)
# Visit every movie page listed in movie.link and collect per-movie details
# (director, stars, tagline, genres, storyline, budget, gross) into parallel
# vectors, one element per movie.
library(dplyr)
library(stringr)
library("rvest")
library("XML")
library("xml2")

length(movie.name)

### One iteration per scraped link: each pass downloads that movie's page and
### scrapes the fields from it.
n_movies <- length(movie.link)

# Preallocate with NA so a field that is missing on a page stays NA and every
# vector ends up with exactly n_movies elements (data.frame() needs equal
# lengths).
moviename <- movie.name
director <- rep(NA_character_, n_movies)
stars <- rep(NA_character_, n_movies)
taglines <- rep(NA_character_, n_movies)
genres <- rep(NA_character_, n_movies)
storyline <- rep(NA_character_, n_movies)
boxOfficeBudget <- rep(NA_character_, n_movies)
boxOfficeGross <- rep(NA_character_, n_movies)

for (i in seq_along(movie.link)) {
  page <- read_html(movie.link[i])

  director[i] <- page %>%
    html_nodes(".summary_text+ .credit_summary_item ") %>%
    html_text(trim = TRUE) %>%
    paste(collapse = " ,")

  # stars data
  stars[i] <- page %>%
    html_nodes(".credit_summary_item~ .credit_summary_item+ .credit_summary_item") %>%
    html_text() %>%
    paste(collapse = " ,")

  # x[1] is NA when the selector matched nothing, so the assignments below
  # never fail on a zero-length result and never warn on a multi-node match.
  tl <- page %>%
    html_nodes("#titleStoryLine .txt-block:nth-child(8)") %>%
    html_text(trim = TRUE)
  # NOTE(review): the outer gsub() pattern was lost in the source text; "\n"
  # is assumed here to mirror the genres/storyline clean-up -- confirm.
  taglines[i] <- gsub("\n", "", gsub("Taglines:", "", tl[1]))

  gs1 <- page %>%
    html_nodes(".see-more.canwrap~ .canwrap") %>%
    html_text(trim = TRUE)
  genres[i] <- gsub("[|]", ",", gsub("\n", "", gsub("Genres:", "", gs1[1])))

  sLine1 <- page %>%
    html_nodes("#titleStoryLine p") %>%
    html_text(trim = TRUE)
  storyline[i] <- gsub("\n", "", gsub("Written by\n", "Written by ", sLine1[1]))

  # paste(collapse = ) always yields exactly one string, so "nothing found"
  # shows up as an empty string and has to be tested with nzchar(), not
  # length().
  boxOfficeBudget1 <- page %>%
    html_nodes(".txt-block:nth-child(11)") %>%
    html_text(trim = TRUE) %>%
    paste(collapse = " ")
  if (nzchar(boxOfficeBudget1)) {
    boxOfficeBudget[i] <- trimws(gsub("Budget:", "", boxOfficeBudget1), which = "both")
  } else {
    boxOfficeBudget[i] <- NA
  }

  # Test the value scraped for THIS movie (boxOfficeGross1), not the result
  # vector being filled.
  boxOfficeGross1 <- page %>%
    html_nodes("#titleDetails .txt-block:nth-child(12)") %>%
    html_text(trim = TRUE) %>%
    paste(collapse = " ")
  if (nzchar(boxOfficeGross1)) {
    boxOfficeGross[i] <- trimws(gsub("Gross:", "", boxOfficeGross1), which = "both")
  } else {
    boxOfficeGross[i] <- NA
  }
}
# movieInfo1 is the data frame that holds all the related movie information:
# the per-movie detail vectors followed by the chart-level columns. All
# thirteen vectors must have the same length for data.frame() to succeed.
movieInfo1 <- data.frame(
  moviename, director, stars, taglines, genres, storyline,
  boxOfficeBudget, boxOfficeGross,
  movie.cast, movie.link, year, votes, rating
)
movieInfo1

# Persist the combined table without the row-name column. FALSE is spelled
# out because F is an ordinary variable that can be reassigned.
write.csv(movieInfo1, "IMDBTop250.movieInfo3.csv", row.names = FALSE)
# Reload the saved movie details from disk so the analysis below can run
# from the CSV alone, without the in-memory scrape results.
# dplyr has to be installed already; run install.packages("dplyr") once,
# interactively, instead of on every run of this script.
library(dplyr)

data <- read.csv("IMDBTop250.movieInfo3.csv")

# Movies released from 1996 through 1998. The filtered rows are only printed;
# `data` itself is left unchanged.
dplyr::filter(data, year >= 1996 & year <= 1998)
data
# Clean the saved movie table and count how many movies fall in each genre.
imdb <- read.csv("IMDBTop250.movieInfo3.csv", strip.white = TRUE)

# Remove all whitespace from the comma-separated genre lists, and the "$" and
# thousands separators from the gross figures.
imdb$genres <- gsub("[[:space:]]", "", imdb$genres)
imdb$boxOfficeGross <- gsub("\\$|,", "", imdb$boxOfficeGross)
write.csv(imdb, file = "IMDB_final.csv")

# One row per (movie, genre) pair. separate_rows() lives in tidyr, which this
# script never attaches, so it is called through its namespace.
imdb_split <- tidyr::separate_rows(imdb, genres, sep = ",")
genre_count <- table(imdb_split$genres)
genre_count

Final_genre_table <- as.data.frame(genre_count)
Final_genre_table
summary(imdb)
### structure of imdb
str(imdb)
library(ggplot2)
library(plyr)

# Assign each movie a numeric decade bucket. Every year before 1970 is pooled
# into the 1960 bucket, 1970-1979 -> 1970, ..., 2010-2019 -> 2010, and years
# from 2020 on (or missing years) get NA.
# findInterval() returns the index of the cutoff interval a year falls in;
# index 7 (>= 2020) is past the end of decade_starts and therefore yields NA.
decade_starts <- seq(1960, 2010, by = 10)
decade_cutoffs <- c(-Inf, seq(1970, 2020, by = 10))
imdb["decade"] <- decade_starts[findInterval(imdb$year, decade_cutoffs)]

# plyr::count() takes the column name as a string; qualifying it avoids
# depending on whether plyr or dplyr currently masks count().
freq <- plyr::count(imdb, "decade")

### We have observed that many top movies are released before 1970
qplot(imdb$decade, geom = "histogram", xlab = "decade", ylab = "Frequency of movies")
# Total box-office gross per decade. The formula interface of aggregate()
# drops rows whose gross or decade is NA before summing; gross values that do
# not parse as numbers become NA (with a coercion warning) and are dropped too.
gross_collections_per_decade <- aggregate(
  as.numeric(boxOfficeGross) ~ decade,
  data = imdb,
  FUN = sum
)
# aggregate() returns the grouping column first, then the summed value.
names(gross_collections_per_decade) <- c("decade", "collection")
plot(gross_collections_per_decade)

# NOTE(review): qqplot() draws a quantile-quantile plot of the two vectors;
# if a plain decade-vs-collection plot was intended, qplot() may have been
# meant -- confirm.
qqplot(gross_collections_per_decade$decade, gross_collections_per_decade$collection)

# Total box-office gross per release year, built the same way.
gross_collections_per_year <- aggregate(
  as.numeric(boxOfficeGross) ~ year,
  data = imdb,
  FUN = sum
)
names(gross_collections_per_year) <- c("year", "collection")