── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
Attaching package: 'rvest'
The following object is masked from 'package:readr':
guess_encoding
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
# Select the ranking nodes with a CSS selector.
# `webpage` is defined earlier in the document (not shown in this section).
rank_data_html <- html_nodes(webpage, ".text-primary")

# Convert the ranking nodes to text.
rank_data <- html_text(rank_data_html)

# Preview the first few rankings.
head(rank_data)
[1] "1." "2." "3." "4." "5." "6."
# Data preprocessing: convert the rankings (strings such as "1.") to numeric.
rank_data <- as.numeric(rank_data)

# Preview the converted rankings.
head(rank_data)
[1] 1 2 3 4 5 6
# Select the title nodes with a CSS selector.
title_data_html <- html_nodes(webpage, ".lister-item-header a")

# Convert the title nodes to text.
title_data <- html_text(title_data_html)

# Preview the first few titles.
head(title_data)
[1] "Arrival" "Hacksaw Ridge"
[3] "Terrifier" "Suicide Squad"
[5] "Batman v Superman: Dawn of Justice" "Me Before You"
# Select the description nodes with a CSS selector.
description_data_html <- html_nodes(webpage, ".ratings-bar+ .text-muted")

# Convert the description nodes to text.
description_data <- html_text(description_data_html)

# Preview the first few descriptions.
head(description_data)
[1] "\nA linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
[2] "\nWorld War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
[3] "\nOn Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
[4] "\nA secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
[5] "\nFearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
[6] "\nA girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Data preprocessing: strip newline characters from the descriptions.
# fixed = TRUE matches the newline literally instead of as a regex.
description_data <- gsub("\n", "", description_data, fixed = TRUE)

# Preview the cleaned descriptions.
head(description_data)
[1] "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."
[2] "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill people and becomes the first man in American history to receive the Medal of Honor without firing a shot."
[3] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
[4] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
[5] "Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs."
[6] "A girl in a small town forms an unlikely bond with a recently-paralyzed man she's taking care of."
# Select the movie runtime nodes with a CSS selector.
runtime_data_html <- html_nodes(webpage, ".text-muted .runtime")

# Convert the runtime nodes to text.
runtime_data <- html_text(runtime_data_html)

# Preview the first few runtimes.
head(runtime_data)
# Data preprocessing: drop the " min" suffix, then convert to numeric.
runtime_data <- gsub(" min", "", runtime_data)
runtime_data <- as.numeric(runtime_data)

# Preview the converted runtimes.
head(runtime_data)
[1] 116 139 85 123 151 106
# Select the movie genre nodes with a CSS selector.
genre_data_html <- html_nodes(webpage, ".genre")

# Convert the genre nodes to text.
genre_data <- html_text(genre_data_html)

# Preview the first few genres.
head(genre_data)
# Data preprocessing: remove newline characters.
genre_data <- gsub("\n", "", genre_data)

# Data preprocessing: remove spaces.
genre_data <- gsub(" ", "", genre_data)

# Keep only the first genre of each movie by dropping everything from the
# first comma onward.
genre_data <- gsub(",.*", "", genre_data)

# Convert each genre from text to a factor.
genre_data <- as.factor(genre_data)

# Preview the cleaned genres.
head(genre_data)
[1] Drama Biography Horror Action Action Drama
Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Select the IMDB rating nodes with a CSS selector.
rating_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")

# Convert the rating nodes to text.
rating_data <- html_text(rating_data_html)

# Preview the first few ratings.
head(rating_data)
[1] "7.9" "8.1" "5.6" "5.9" "6.4" "7.4"
# Data preprocessing: convert the ratings from text to numeric.
rating_data <- as.numeric(rating_data)

# Preview the converted ratings.
head(rating_data)
[1] 7.9 8.1 5.6 5.9 6.4 7.4
# Select the vote-count nodes with a CSS selector.
votes_data_html <- html_nodes(
  webpage,
  ".sort-num_votes-visible span:nth-child(2)"
)

# Convert the vote-count nodes to text.
votes_data <- html_text(votes_data_html)

# Preview the first few vote counts.
head(votes_data)
# Data preprocessing: remove the thousands separators (commas).
votes_data <- gsub(",", "", votes_data)

# Data preprocessing: convert the vote counts to numeric.
votes_data <- as.numeric(votes_data)

# Preview the converted vote counts.
head(votes_data)
[1] 722916 553431 43200 701352 714484 268259
# Select the director nodes with a CSS selector.
directors_data_html <- html_nodes(webpage, ".text-muted+ p a:nth-child(1)")

# Convert the director nodes to text.
directors_data <- html_text(directors_data_html)

# Preview the first few directors.
head(directors_data)
# Data preprocessing: convert the directors from text to a factor.
directors_data <- as.factor(directors_data)

# Select the actor nodes with a CSS selector.
actors_data_html <- html_nodes(webpage, ".lister-item-content .ghost+ a")

# Convert the actor nodes to text.
actors_data <- html_text(actors_data_html)

# Preview the first few actors.
head(actors_data)
# Data preprocessing: convert the actors from text to a factor.
actors_data <- as.factor(actors_data)

# Check how many actor entries were scraped.
length(actors_data)
[1] 100
library(stringr)

# Scrape the ratings bar and convert it to text.
ratings_bar_data <- html_nodes(webpage, ".ratings-bar") %>%
  html_text2()

# Preview the ratings bar text.
head(ratings_bar_data)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
21.00 47.00 61.00 60.05 72.50 99.00 5
# Scrape the votes bar and convert it to text.
votes_bar_data <- html_nodes(webpage, ".sort-num_votes-visible") %>%
  html_text2()

# Preview the votes bar text.
head(votes_bar_data)
# Combine all the scraped vectors into one data frame, one row per movie.
# `metascore_data` and `gross_data` are built elsewhere in the document
# (not shown in this section).
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the structure of the data frame.
str(movies_df)
'data.frame': 100 obs. of 11 variables:
$ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
$ Title : chr "Arrival" "Hacksaw Ridge" "Terrifier" "Suicide Squad" ...
$ Description : chr "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appea"| __truncated__ "World War II American Army Medic Desmond T. Doss, who served during the Battle of Okinawa, refuses to kill peop"| __truncated__ "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ ...
$ Runtime : num 116 139 85 123 151 106 108 111 128 107 ...
$ Genre : Factor w/ 8 levels "Action","Adventure",..: 7 4 8 1 1 7 1 7 5 3 ...
$ Rating : num 7.9 8.1 5.6 5.9 6.4 7.4 8 6.5 8 7.6 ...
$ Metascore : num 81 71 NA 40 44 51 65 26 94 81 ...
$ Votes : num 722916 553431 43200 701352 714484 ...
$ Gross_Earning_in_Mil: num 100.5 67.2 NA 325.1 330.3 ...
$ Director : Factor w/ 97 levels "Alex Proyas",..: 29 62 20 25 97 90 93 83 19 81 ...
$ Actor : Factor w/ 90 levels "Aamir Khan","Adam Driver",..: 4 5 41 88 8 28 74 10 73 7 ...
library("ggplot2")

# NOTE(review): `plot1` holds a copy of the data frame, not the plot; kept
# as in the original in case later code relies on it.
plot1 <- movies_df

# Histogram of runtime, filled by genre, with overlapping semi-transparent
# bars in 5-unit bins.
# Built with ggplot() because qplot() is deprecated since ggplot2 3.4.0, and
# because qplot() already drew its own 30-bin histogram underneath the
# explicit geom_histogram() layer.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    binwidth = 5,
    color = "white"
  ) +
  scale_fill_discrete(name = "Genre") +
  labs(title = "100 Most Popular Feature Film in 2016 Runtime by Genre")
Warning: `qplot()` was deprecated in ggplot2 3.4.0.
Question 1: Based on the above data, which movie from which Genre had the longest runtime?
Answer: Batman v Superman: Dawn of Justice (Ultimate Edition), from the Action genre, had the longest runtime at 182 minutes.
Title Genre Runtime
19 Batman v Superman: Dawn of Justice (Ultimate Edition) Action 182
Question 2: Based on the above data, in the runtime range of 130-160 minutes, which genre has the movie with the most votes?
Answer: The Action genre. Captain America: Civil War received the most votes, with 815,574.
# Scatter plot of rating against runtime; point size encodes votes and
# colour encodes genre.
# The `text` aesthetic is not used by ggplot2 itself; it is presumably meant
# for plotly tooltips (plotly is loaded above) -- confirm against later code.
# It sits in the plot-level mapping so the layer does not warn about an
# unknown aesthetic, and it reads the `Title` column of `movies_df` rather
# than the separate `title_data` vector.
ggplot(
  movies_df,
  aes(x = Runtime, y = Rating, text = paste("Movie Title:", Title))
) +
  geom_point(aes(size = Votes, col = Genre), alpha = 0.7) +
  labs(title = " 100 Most Popular Feature Film in 2016 Runtime by Ratings")
Warning in geom_point(aes(size = Votes, col = Genre, text = paste("Movie
Title:", : Ignoring unknown aesthetics: text
Title Genre Runtime Votes
5 Captain America: Civil War Action 147 815574
Question 3: Based on the above data, which genre has the highest average gross earnings among movies with a runtime of 100 to 120 minutes?
Answer: Animation has the highest average gross earnings for runtimes of 100 to 120 minutes, at 216.33 million.
# Scatter plot of gross earnings (in millions) against runtime; point size
# encodes rating and colour encodes genre.
# The y axis is limited to -10..600.
ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, col = Genre), alpha = 0.5) +
  labs(
    title = " 100 Most Popular Feature Film in 2016 Gross Earnings in Millions"
  ) +
  scale_y_continuous("Gross Earnings in Millions", limits = c(-10, 600))