── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.3 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.3 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
Attaching package: 'rvest'
The following object is masked from 'package:readr':
guess_encoding
# Define the IMDb search URL (count=100, release_date=2016,2016,
# title_type=feature)
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page at that URL
webpage <- read_html(url)
Rank
# Use a CSS selector to scrape the rankings section
rank_data_html <- html_nodes(webpage, ".text-primary")
# Convert the ranking nodes to text
rank_data <- html_text(rank_data_html)
# Pre-processing: convert the rankings to numeric
rank_data <- as.numeric(rank_data)
# Take another look at the rankings
head(rank_data)
[1] 1 2 3 4 5 6
length(rank_data)
[1] 100
Movie Title
# Use a CSS selector to scrape the title section
title_data_html <- html_nodes(webpage, ".lister-item-header a")
# Convert the title nodes to text
title_data <- html_text(title_data_html)
# Take a look at the titles
head(title_data)
# Use a CSS selector to scrape the description section
description_data_html <- html_nodes(webpage, ".ratings-bar+ .text-muted")
# Convert the description nodes to text
description_data <- html_text(description_data_html)
# Pre-processing: strip the newline characters from each description
description_data <- gsub("\n", "", description_data)
# Take another look at the description data
head(description_data)
[1] "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown."
[2] "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."
[3] "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is rumored to have committed apostasy, and to propagate Catholicism."
[4] "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence when a masked killer appears at her window."
[5] "Ed and Lorraine Warren travel to North London to help a single mother raising four children alone in a house plagued by a supernatural spirit."
[6] "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."
length(description_data)
[1] 100
Movie Run Times
# Use a CSS selector to scrape the movie runtime section
runtime_data_html <- html_nodes(webpage, ".text-muted .runtime")
# Convert the runtime nodes to text
runtime_data <- html_text(runtime_data_html)
# Pre-processing: drop the " min" suffix, then convert to numeric
runtime_data <- gsub(" min", "", runtime_data)
runtime_data <- as.numeric(runtime_data)
# Take a look at the runtimes
head(runtime_data)
[1] 85 123 161 82 134 117
length(runtime_data)
[1] 100
Genre
# Use a CSS selector to scrape the movie genre section
genre_data_html <- html_nodes(webpage, ".genre")
# Convert the genre nodes to text
genre_data <- html_text(genre_data_html)
head(genre_data)
# Remove newline characters
genre_data <- gsub("\n", "", genre_data)
# Remove excess spaces
genre_data <- gsub(" ", "", genre_data)
# Keep only the first genre of each movie (drop everything from the first comma)
genre_data <- gsub(",.*", "", genre_data)
# Convert each genre from text to a factor
genre_data <- as.factor(genre_data)
head(genre_data)
[1] Horror Action Drama Horror Horror Horror
9 Levels: Action Adventure Animation Biography Comedy Crime Drama ... Horror
length(genre_data)
[1] 100
IMDB Ratings
# Use a CSS selector to scrape the IMDB rating section
rating_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")
# Convert the rating nodes to text
rating_data <- html_text(rating_data_html)
# Take a look at the ratings
head(rating_data)
[1] "5.6" "5.9" "7.2" "6.6" "7.3" "7.3"
length(rating_data)
[1] 100
# Pre-processing: convert the ratings from text to numeric
rating_data <- as.numeric(rating_data)
# Take another look at the ratings data
head(rating_data)
[1] 5.6 5.9 7.2 6.6 7.3 7.3
length(rating_data)
[1] 100
Number of votes
# Use a CSS selector to scrape the votes section
votes_data_html <- html_nodes(
  webpage,
  ".sort-num_votes-visible span:nth-child(2)"
)
# Convert the votes nodes to text
votes_data <- html_text(votes_data_html)
# Take a look at the votes data
head(votes_data)
# Pre-processing: remove the thousands-separator commas
votes_data <- gsub(",", "", votes_data)
# Pre-processing: convert the votes to numeric
votes_data <- as.numeric(votes_data)
# Take another look at the votes data
head(votes_data)
[1] 47604 710171 119435 149202 292206 532800
length(votes_data)
[1] 100
Director
# Use a CSS selector to scrape the directors section
directors_data_html <- html_nodes(webpage, ".text-muted+ p a:nth-child(1)")
# Convert the directors nodes to text
directors_data <- html_text(directors_data_html)
# Take a look at the directors data
head(directors_data)
# Pre-processing: convert the directors data into a factor
directors_data <- as.factor(directors_data)
Actors (note: only 99 values scraped?)
# Use a CSS selector to scrape the actors section
actors_data_html <- html_nodes(webpage, ".lister-item-content .ghost+ a")
# Convert the actors nodes to text
actors_data <- html_text(actors_data_html)
# Take a look at the actors data
head(actors_data)
# Pre-processing: convert the actors data into a factor
actors_data <- as.factor(actors_data)
Meta Score
# Use a CSS selector to scrape the metascore section
metascore_data_html <- html_nodes(webpage, ".metascore")
# Convert the metascore nodes to text
metascore_data <- html_text(metascore_data_html)
# Take a look at the metascores
head(metascore_data)
[1] "40 " "79 " "67 " "65 " "63 "
[6] "71 "
# Pre-processing: remove the extra spaces in the metascore text
metascore_data <- gsub(" ", "", metascore_data)
# Check the length of the metascore data
length(metascore_data)
[1] 95
MetaScore
# Scrape the ratings bar and convert it to text
ratings_bar_data <- html_nodes(webpage, ".ratings-bar") |>
  html_text2()
# Look at the ratings bar
head(ratings_bar_data)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
21.00 47.00 62.00 60.36 72.50 99.00 5
length(metascore_data)
[1] 100
Gross Earning
# Scrape the votes bar and convert it to text
votes_bar_data <- html_nodes(webpage, ".sort-num_votes-visible") |>
  html_text2()
# Look at the votes bar data
head(votes_bar_data)
# Combine all the scraped vectors into one data frame (Actor is left out).
# NOTE(review): gross_data is not defined in the visible part of this document,
# and the printed structure shows a numeric Metascore with NA values, so both
# are presumably prepared by code that is not shown here -- confirm.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data
) # , Actor = actors_data
# Structure of the data frame
str(movies_df)
'data.frame': 100 obs. of 10 variables:
$ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
$ Title : chr "Terrifier" "Suicide Squad" "Silence" "Hush" ...
$ Description : chr "On Halloween night, Tara Heyes finds herself as the obsession of a sadistic murderer known as Art the Clown." "A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive "| __truncated__ "In the 17th century, two Portuguese Jesuit priests travel to Japan in an attempt to locate their mentor, who is"| __truncated__ "A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in silence "| __truncated__ ...
$ Runtime : num 85 123 161 82 134 117 139 145 128 108 ...
$ Genre : Factor w/ 9 levels "Action","Adventure",..: 9 1 7 9 9 9 4 7 5 3 ...
$ Rating : num 5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
$ Metascore : num NA 40 79 67 65 63 71 85 94 59 ...
$ Votes : num 47604 710171 119435 149202 292206 ...
$ Gross_Earning_in_Mil: num NA 325.1 7.1 NA 102.4 ...
$ Director : Factor w/ 97 levels "Alessandro Carloni",..: 19 22 61 65 44 59 63 71 18 34 ...
Analyzing scraped data from the web
Question 1: Which movie from which genre had the longest runtime?
The movie Silence has the longest run time and it belongs to the Drama genre.
# For each genre, keep the movie(s) with the longest runtime (ties are kept),
# then drop the grouping so later steps work on an ungrouped data frame.
longrun <- movies_df |>
  group_by(Genre) |>
  slice_max(Runtime, n = 1, with_ties = TRUE) |>
  ungroup()

# Bar chart of the longest runtime per genre, one bar per movie title
ggplot(longrun, aes(x = Genre, y = Runtime, fill = Title)) +
  geom_bar(stat = "identity", position = "dodge", alpha = .65) +
  labs(
    title = "Runtime and Genre",
    x = "Genre",
    y = "Runtime( Minutes)",
    fill = "Movie Title"
  ) +
  theme_minimal() +
  theme(
    # Tilt the genre labels so they do not overlap
    axis.text.x = element_text(angle = 35, hjust = 1),
    # Centre the plot title
    plot.title = element_text(hjust = .5)
  )
Question 2: Among movies with a runtime of 130-160 minutes, which genre has the highest votes?
The Biography genre has the highest average number of votes.