library('rvest')
# IMDb title search: up to 100 feature films with a 2016 release date.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the results page.
webpage <- read_html(x = url)
# Rank: select the '.text-primary' nodes and keep their text.
rank_data_html <- html_nodes(x = webpage, css = ".text-primary")
rank_data <- html_text(x = rank_data_html)
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# The text carries a trailing dot ("1."); as.numeric() parses it directly.
rank_data <- as.numeric(rank_data)
head(rank_data)
## [1] 1 2 3 4 5 6
# Title: text of the links inside each '.lister-item-header'.
title_data_html <- html_nodes(x = webpage, css = ".lister-item-header a")
title_data <- html_text(x = title_data_html)
head(title_data)
## [1] "The Magnificent Seven" "Me Before You"
## [3] "Rogue One: A Star Wars Story" "Hidden Figures"
## [5] "Suicide Squad" "Sing"
# Description: the '.text-muted' element that directly follows '.ratings-bar'.
description_data_html <- html_nodes(x = webpage, css = ".ratings-bar+ .text-muted")
description_data <- html_text(x = description_data_html)
head(description_data)
## [1] "\nSeven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."
## [2] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
## [3] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [4] "\nThe story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
## [5] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [6] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
# Remove the newline characters that come with the scraped text.
description_data <- gsub(pattern = "\n", replacement = "", x = description_data, fixed = TRUE)
head(description_data)
## [1] "Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."
## [2] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
## [3] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [4] "The story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
## [5] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [6] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
# Runtime: text of the '.runtime' nodes nested under '.text-muted'.
runtime_data_html <- html_nodes(x = webpage, css = ".text-muted .runtime")
runtime_data <- html_text(x = runtime_data_html)
head(runtime_data)
## [1] "132 min" "106 min" "133 min" "127 min" "123 min" "108 min"
# Drop the " min" unit, then convert what is left to numeric.
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))
head(runtime_data)
## [1] 132 106 133 127 123 108
# Genre: text of the '.genre' nodes (a comma-separated list per movie).
genre_data_html <- html_nodes(x = webpage, css = ".genre")
genre_data <- html_text(x = genre_data_html)
head(genre_data)
## [1] "\nAction, Adventure, Western "
## [2] "\nDrama, Romance "
## [3] "\nAction, Adventure, Fantasy "
## [4] "\nBiography, Drama, History "
## [5] "\nAction, Adventure, Fantasy "
## [6] "\nAnimation, Comedy, Family "
# Strip newlines and spaces in one pass, then keep only the first genre by
# cutting everything from the first comma onward.
genre_data <- gsub("[\n ]", "", genre_data)
genre_data <- sub(",.*", "", genre_data)
# Store the genre as a factor.
genre_data <- as.factor(genre_data)
head(genre_data)
## [1] Action Drama Action Biography Action Animation
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Rating: text of the <strong> element inside '.ratings-imdb-rating'.
rating_data_html <- html_nodes(x = webpage, css = ".ratings-imdb-rating strong")
rating_data <- html_text(x = rating_data_html)
head(rating_data)
## [1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"
# Ratings are scraped as character strings; convert them to numeric.
rating_data <- as.numeric(rating_data)
head(rating_data)
## [1] 6.8 7.4 7.8 7.8 5.9 7.1
# Votes: the second child <span> of '.sort-num_votes-visible'.
votes_data_html <- html_nodes(x = webpage, css = ".sort-num_votes-visible span:nth-child(2)")
votes_data <- html_text(x = votes_data_html)
head(votes_data)
## [1] "217,160" "263,311" "652,034" "238,318" "695,537" "176,671"
# Remove the thousands separators, then convert to numeric.
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
head(votes_data)
## [1] 217160 263311 652034 238318 695537 176671
# Director: the first link in the paragraph that follows a '.text-muted' element.
directors_data_html <- html_nodes(x = webpage, css = ".text-muted+ p a:nth-child(1)")
directors_data <- html_text(x = directors_data_html)
head(directors_data)
## [1] "Antoine Fuqua" "Thea Sharrock" "Gareth Edwards" "Theodore Melfi"
## [5] "David Ayer" "Garth Jennings"
# Store director names as a factor.
directors_data <- as.factor(directors_data)
# Actor: the link that directly follows the '.ghost' element in each
# '.lister-item-content' block.
actors_data_html <- html_nodes(x = webpage, css = ".lister-item-content .ghost+ a")
actors_data <- html_text(x = actors_data_html)
head(actors_data)
## [1] "Denzel Washington" "Emilia Clarke" "Felicity Jones"
## [4] "Taraji P. Henson" "Will Smith" "Matthew McConaughey"
# Store actor names as a factor.
actors_data <- as.factor(actors_data)
# Metascore: not every movie has one. Selecting '.metascore' across the whole
# page returns only the scores that exist (96 here), so each gap would have to
# be patched in at a hard-coded position. Instead, query each movie's own
# '.lister-item-content' container with html_node() (singular): it returns
# exactly one result per container, with a missing node where there is no
# '.metascore', so every score stays aligned with its movie.
metascore_data_html <- html_node(html_nodes(webpage, '.lister-item-content'), '.metascore')
# html_text() gives NA for a missing node; trim = TRUE removes the whitespace
# padding that surrounds the scraped score.
metascore_data <- html_text(metascore_data_html, trim = TRUE)
head(metascore_data)
# NA strings convert to numeric NA without coercion warnings.
metascore_data <- as.numeric(metascore_data)
# One entry per movie container on the page.
length(metascore_data)
## [1] 100
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 46.00 60.50 59.57 73.25 99.00 4
# Gross earnings (in millions): many movies have no gross figure. Selecting
# the gross nodes across the whole page returns only the values that exist
# (89 here), and inserting NAs at hand-picked positions is how the vector
# ended up with the wrong length. Instead, query each movie's own
# '.lister-item-content' container with html_node() (singular): it returns
# exactly one result per container, with a missing node where the movie has
# no gross figure, so the vector lines up with the other columns.
gross_data_html <- html_node(html_nodes(webpage, '.lister-item-content'), '.ghost~ .text-muted+ span')
# html_text() gives NA for a missing node.
gross_data <- html_text(gross_data_html)
head(gross_data)
## [1] "$93.43M" "$56.25M" "$532.18M" "$169.61M" "$325.10M" "$270.40M"
# Strip only the "$" prefix and "M" suffix so the full value survives; a
# fixed-width substring(x, 2, 6) would cut "$532.18" down to "532.1".
gross_data <- as.numeric(gsub("[$M]", "", gross_data))
# One entry per movie container on the page.
length(gross_data)
summary(gross_data)
When I tried to build the data frame, I received the following error:
Error in data.frame(Rank = rank_data, Title = title_data, Description = description_data, : arguments imply differing number of rows: 100, 165. I know that this website is continuously updated, so the page I scraped may contain different information from the one used in the walkthrough. I tried to be proactive by looking at which(is.na(gross_data)) and using the NA positions I saw there instead of the ones from the walkthrough, but this gave me another error:
Error in data.frame(Rank = rank_data, Title = title_data, Description = description_data, : arguments imply differing number of rows: 100, 165
I know this will jeopardize some points, but it’s a risk I’m willing to take, since I can’t go forward with the analysis without creating the data frame, which in this case does not contain the gross data.
# Combine the scraped vectors into one data frame, one row per movie.
# Gross earnings are not included here.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Director = directors_data,
  Actor = actors_data
)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Histogram of runtimes (30 bins), stacked by genre. Written with ggplot()
# because qplot() is deprecated as of ggplot2 3.4.0, and filled by the Genre
# column of movies_df rather than the free-standing genre_data vector so the
# plot depends only on the data frame.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)
This visualization indicates that the movie with the longest runtime is an adventure movie; when we dig into the data, we find that it is “American Honey”, directed by Andrea Arnold.
# Scatter plot of rating against runtime; point size encodes the number of
# votes and colour encodes the genre.
wsplot2 <- ggplot(data = movies_df, mapping = aes(x = Runtime, y = Rating)) +
  geom_point(mapping = aes(size = Votes, colour = Genre))
# Render the ggplot object as an interactive plotly widget.
ggplotly(p = wsplot2)
The visualization indicates that the action genre had the highest number of votes, at 804,066.
As stated before, this graph was giving me an issue, so I can answer the question but cannot graph it. Among movies with runtimes between 100 and 120 minutes, the action genre has the highest average gross earnings, with adventure coming incredibly close.
# ggplot(movies_df,aes(x=Runtime,y=Gross_Earning_in_Mil))+
# geom_point(aes(size=Rating,col=Genre))