library('rvest')
# Page to scrape: the query string asks for 100 feature films released in 2016.
url <- "https://www.imdb.com/search/title/?count=100&release_date=2016,2016&title_type=feature"
webpage <- read_html(x = url)

# Ranking labels are read from the ".text-primary" nodes.
rank_data_html <- html_nodes(x = webpage, css = ".text-primary")
rank_data <- html_text(x = rank_data_html)
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# The labels carry a trailing dot ("1."); as.numeric() still parses them.
rank_data <- as.numeric(rank_data)
head(rank_data)
## [1] 1 2 3 4 5 6
# Movie titles: text of the links inside each ".lister-item-header".
title_data_html <- html_nodes(x = webpage, css = ".lister-item-header a")
title_data <- html_text(x = title_data_html)
head(title_data)
## [1] "The Magnificent Seven" "Me Before You"
## [3] "Rogue One: A Star Wars Story" "Hidden Figures"
## [5] "Suicide Squad" "Sing"
# Plot summaries: the ".text-muted" element directly following ".ratings-bar".
description_data_html <- html_nodes(
  x = webpage,
  css = ".ratings-bar+ .text-muted"
)
description_data <- html_text(x = description_data_html)
head(description_data)
## [1] "\nSeven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."
## [2] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
## [3] "\nIn a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [4] "\nThe story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
## [5] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [6] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
# Drop the newline characters that the raw summaries carry.
description_data <- gsub(
  pattern = "\n",
  replacement = "",
  x = description_data,
  fixed = TRUE
)
head(description_data)
## [1] "Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town from the private army of a destructive industrialist."
## [2] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
## [3] "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death Star, the Empire's ultimate weapon of destruction."
## [4] "The story of a team of female African-American mathematicians who served a vital role in NASA during the early years of the U.S. space program."
## [5] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
## [6] "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists find that their lives will never be the same."
# Runtimes: scraped as text such as "132 min".
runtime_data_html <- html_nodes(x = webpage, css = ".text-muted .runtime")
runtime_data <- html_text(x = runtime_data_html)
head(runtime_data)
## [1] "132 min" "106 min" "133 min" "127 min" "123 min" "108 min"
# Remove the " min" suffix and convert what remains to a number.
runtime_data <- as.numeric(gsub(" min", "", runtime_data))
head(runtime_data)
## [1] 132 106 133 127 123 108
# Genres: each ".genre" node holds a comma-separated list of genres.
genre_data_html <- html_nodes(x = webpage, css = ".genre")
genre_data <- html_text(x = genre_data_html)
head(genre_data)
## [1] "\nAction, Adventure, Western "
## [2] "\nDrama, Romance "
## [3] "\nAction, Adventure, Sci-Fi "
## [4] "\nBiography, Drama, History "
## [5] "\nAction, Adventure, Fantasy "
## [6] "\nAnimation, Comedy, Family "
# Strip newlines and spaces, keep only the first listed genre (everything
# from the first comma onward is dropped), and store the result as a factor.
genre_data <- gsub("[\n ]", "", genre_data)
genre_data <- as.factor(gsub(",.*", "", genre_data))
head(genre_data)
## [1] Action Drama Action Biography Action Animation
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# IMDb ratings: text of the <strong> inside ".ratings-imdb-rating".
rating_data_html <- html_nodes(x = webpage, css = ".ratings-imdb-rating strong")
rating_data <- html_text(x = rating_data_html)
head(rating_data)
## [1] "6.8" "7.4" "7.8" "7.8" "5.9" "7.1"
# Ratings arrive as text; convert them to numbers.
rating_data <- as.numeric(rating_data)
head(rating_data)
## [1] 6.8 7.4 7.8 7.8 5.9 7.1
# Vote counts: second <span> child inside ".sort-num_votes-visible".
votes_data_html <- html_nodes(
  x = webpage,
  css = ".sort-num_votes-visible span:nth-child(2)"
)
votes_data <- html_text(x = votes_data_html)
head(votes_data)
## [1] "217,177" "263,326" "652,047" "238,330" "695,550" "176,676"
# Remove the thousands separators, then convert to numeric.
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
head(votes_data)
## [1] 217177 263326 652047 238330 695550 176676
# Directors: the first-child link in the paragraph that follows ".text-muted".
directors_data_html <- html_nodes(
  x = webpage,
  css = ".text-muted+ p a:nth-child(1)"
)
directors_data <- html_text(x = directors_data_html)
head(directors_data)
## [1] "Antoine Fuqua" "Thea Sharrock" "Gareth Edwards" "Theodore Melfi"
## [5] "David Ayer" "Garth Jennings"
# Store director names as a factor.
directors_data <- as.factor(directors_data)
# Actors: the link immediately after the ".ghost" element of each item, so
# one actor is taken per movie.
actors_data_html <- html_nodes(
  x = webpage,
  css = ".lister-item-content .ghost+ a"
)
actors_data <- html_text(x = actors_data_html)
head(actors_data)
## [1] "Denzel Washington" "Emilia Clarke" "Felicity Jones"
## [4] "Taraji P. Henson" "Will Smith" "Matthew McConaughey"
# Store actor names as a factor.
actors_data <- as.factor(actors_data)
head(actors_data)
## [1] Denzel Washington Emilia Clarke Felicity Jones
## [4] Taraji P. Henson Will Smith Matthew McConaughey
## 92 Levels: Adam Sandler Alexander Skarsgård Amy Adams ... Zoey Deutch
# Metascores: text of the ".metascore" nodes. Only movies that have a
# Metascore produce a node, so this vector can be shorter than the others.
metascore_data_html <- html_nodes(x = webpage, css = ".metascore")
metascore_data <- html_text(x = metascore_data_html)
head(metascore_data)
## [1] "54 " "51 " "65 " "74 " "40 "
## [6] "59 "
# Remove the padding spaces around each score.
metascore_data <- gsub(
  pattern = " ",
  replacement = "",
  x = metascore_data,
  fixed = TRUE
)
head(metascore_data)
## [1] "54" "51" "65" "74" "40" "59"
length(metascore_data)
## [1] 96
# Re-align the Metascores with the other columns by inserting a missing value
# at each rank that has no Metascore. The positions are given in ascending
# order and refer to the final vector, because every insertion shifts the
# later elements by one.
# NOTE(review): the positions are hard-coded (presumably found by inspecting
# the page this script was written against) and must be revisited whenever
# the scraped page changes.
for (i in c(15, 27, 58, 74)) {
  # append(after = i - 1) also covers i == 1 and keeps the vector a plain
  # character vector; a real NA converts to numeric without coercion warnings.
  metascore_data <- append(metascore_data, NA_character_, after = i - 1)
}
metascore_data <- as.numeric(metascore_data)
length(metascore_data)
## [1] 100
summary(metascore_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 46.00 60.50 59.57 73.25 99.00 4
# Gross earnings: scraped as short text made of a one-character prefix, a
# figure and an "M" suffix (presumably dollars in millions - confirm against
# the page). Only movies with a reported gross produce a node.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)
# Keep only digits and the decimal point. Stripping by pattern instead of
# substring(gross_data, 2, 6) avoids cutting every figure to five characters.
gross_data <- gsub("[^0-9.]", "", gross_data)
# NOTE: the output recorded below was produced with the five-character cut,
# so longer figures appear shortened there (e.g. "532.1").
head(gross_data)
## [1] "93.43" "56.25" "532.1" "169.6" "325.1" "270.4"
length(gross_data)
## [1] 89
# Re-align the gross figures with the other columns by inserting a missing
# value at each rank that has no reported gross. The positions are given in
# ascending order and refer to the final vector, because every insertion
# shifts the later elements by one.
# NOTE(review): the positions are hard-coded (presumably found by inspecting
# the page this script was written against) and must be revisited whenever
# the scraped page changes.
for (i in c(15, 27, 45, 51, 55, 58, 61, 74, 75, 83, 85)) {
  # append(after = i - 1) also covers i == 1 and keeps the vector a plain
  # character vector; a real NA converts to numeric without coercion warnings.
  gross_data <- append(gross_data, NA_character_, after = i - 1)
}
gross_data <- as.numeric(gross_data)
length(gross_data)
## [1] 100
summary(gross_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.01 26.86 61.71 101.28 127.40 532.10 11
# Combine the scraped vectors into one data frame, one row per movie.
# data.frame() requires the columns to have compatible lengths, so the gap
# filling of Metascore and gross earnings has to happen before this point.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
str(movies_df)
## 'data.frame': 100 obs. of 11 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "The Magnificent Seven" "Me Before You" "Rogue One: A Star Wars Story" "Hidden Figures" ...
## $ Description : chr "Seven gunmen from a variety of backgrounds are brought together by a vengeful young widow to protect her town f"| __truncated__ "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of." "In a time of conflict, a group of unlikely heroes band together on a mission to steal the plans to the Death St"| __truncated__ "The story of a team of female African-American mathematicians who served a vital role in NASA during the early "| __truncated__ ...
## $ Runtime : num 132 106 133 127 123 108 128 108 139 116 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 1 7 1 4 1 3 5 1 4 7 ...
## $ Rating : num 6.8 7.4 7.8 7.8 5.9 7.1 8 8 8.1 7.9 ...
## $ Metascore : num 54 51 65 74 40 59 94 65 71 81 ...
## $ Votes : num 217177 263326 652047 238330 695550 ...
## $ Gross_Earning_in_Mil: num 93.4 56.2 532.1 169.6 325.1 ...
## $ Director : Factor w/ 99 levels "Aisling Walsh",..: 11 91 34 92 26 36 20 94 61 30 ...
## $ Actor : Factor w/ 92 levels "Adam Sandler",..: 19 25 30 85 91 59 74 75 4 3 ...
library('ggplot2')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Histogram of runtimes for all movies, filled by genre (30 bins).
# Written with ggplot() + geom_histogram() because qplot() is deprecated
# since ggplot2 3.4.0.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

# Movies running 151 minutes or longer, to identify the longest one.
q1 <- movies_df %>%
  filter(Runtime >= 151) %>%
  select(Title, Genre, Runtime)

# Same histogram restricted to those movies, filled by title (6 bins) and
# rendered as an interactive plotly widget.
qq1 <- ggplot(q1, aes(x = Runtime, fill = Title)) +
  geom_histogram(bins = 6)
ggplotly(qq1)
Based on the above data, which movie from which Genre had the longest runtime?
American Honey, an adventure movie.
# Rating against runtime; point size encodes votes, colour encodes genre.
ggplot(movies_df, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, col = Genre))

# Movies running 130-160 minutes, ordered by descending genre level.
# q2 stays grouped by Genre, as before.
q2 <- movies_df %>%
  group_by(Genre) %>%
  arrange(desc(Genre)) %>%
  filter(Runtime >= 130 & Runtime <= 160) %>%
  select(Title, Runtime, Votes, Genre)

# Votes against runtime for that subset. Points take no `bins` argument and
# do not show `fill` with the default point shape, so genre is mapped to
# colour, matching the scatter plot above.
qq2 <- ggplot(q2, aes(x = Runtime, y = Votes, colour = Genre)) +
  geom_point()
ggplotly(qq2)
# Total votes per genre among the 130-160 minute movies in q2, largest first.
# Summarising by Genre does not depend on the row order of q2 or on
# hand-maintained row indices.
q2 %>%
  group_by(Genre) %>%
  summarise(Total_Votes = sum(Votes)) %>%
  arrange(desc(Total_Votes))
# Totals recorded from the original run:
#   Action     2973435
#   Drama       775140
#   Biography   581352
#   Adventure   485257
#   Horror      278875
#   Animation    87263
Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?
The Action genre has the highest number of votes, at 2,973,435.
# Gross earnings against runtime; point size encodes rating, colour encodes
# genre. Rows without a gross figure are dropped by geom_point() with a warning.
ggplot(
  data = movies_df,
  mapping = aes(x = Runtime, y = Gross_Earning_in_Mil)
) +
  geom_point(mapping = aes(size = Rating, col = Genre))
## Warning: Removed 11 rows containing missing values (`geom_point()`).

# Keep only movies whose gross, runtime and genre are all present.
nona_movies_df <- movies_df %>%
  filter(
    !is.na(Gross_Earning_in_Mil),
    !is.na(Runtime),
    !is.na(Genre)
  )

# Movies running 100-120 minutes, grouped by genre and gross figure and
# ordered by descending genre level.
q3 <- nona_movies_df %>%
  filter(Runtime >= 100, Runtime <= 120) %>%
  select(Runtime, Genre, Gross_Earning_in_Mil) %>%
  group_by(Genre, Gross_Earning_in_Mil) %>%
  arrange(desc(Genre))
# Average gross (in millions) per genre among the 100-120 minute movies in
# q3, highest first. q3 is grouped by Genre and gross figure, so it is
# regrouped by Genre alone before averaging. Computing the mean per group does
# not depend on the row order of q3 or on hand-maintained row ranges and
# divisors.
q3 %>%
  group_by(Genre) %>%
  summarise(
    Movies = n(),
    Avg_Gross_in_Mil = mean(Gross_Earning_in_Mil)
  ) %>%
  arrange(desc(Avg_Gross_in_Mil))
# Averages recorded from the original run (number of movies in parentheses):
#   Animation  216.33    (4)
#   Comedy     148.306   (5)
#   Action     100.5694  (16)
#   Biography   96.638   (5)
#   Crime       80.21667 (3)
#   Drama       62.74286 (7)
#   Adventure   41.12    (2)
#   Horror       5.88    (1)
Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?
Animation movies have the highest average gross earnings among movies with a runtime of 100 to 120 minutes.