library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# IMDb search results to scrape: up to 100 feature films released in 2016
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download the page and parse its HTML
webpage <- url %>% read_html()

# Select the ranking nodes with a CSS selector
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Extract the text of every ranking node
rank_data <- rank_data_html %>% html_text()

# Preview the raw rankings
head(rank_data)
## [1] "1." "2." "3." "4." "5." "6."
# Parse the ranking strings as numbers
rank_data <- rank_data %>% as.numeric()

# Preview the numeric rankings
head(rank_data)
## [1] 1 2 3 4 5 6
# Select the title links with a CSS selector
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Extract the title text from each link
title_data <- title_data_html %>% html_text()

# Preview the titles
head(title_data)
## [1] "Deadpool"                     "Suicide Squad"               
## [3] "Rogue One: A Star Wars Story" "Split"                       
## [5] "Marauders"                    "Office Christmas Party"
# Select the description paragraphs with a CSS selector
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Extract the description text and preview it
description_data <- description_data_html %>% html_text()
head(description_data)
## [1] "\n    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                                              
## [2] "\n    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                                                
## [3] "\n    The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star."                                                                                                                              
## [4] "\n    Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."                                                                                      
## [5] "\n    When a bank is hit by a brutal heist, all evidence points to the owner and his high-powered clients. But as a group of FBI agents dig deeper into the case - and the deadly heists continue - it becomes clear that a larger conspiracy is at play."
## [6] "\n    When his uptight CEO sister threatens to shut down his branch, the branch manager throws an epic Christmas party in order to land a big client and save the day, but the party gets way out of hand..."
# Drop the newline characters (matched literally) from every description
description_data <- gsub("\n", "", description_data, fixed = TRUE)

# Preview the cleaned descriptions
head(description_data)
## [1] "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks."                                                                                                              
## [2] "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                                                
## [3] "    The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star."                                                                                                                              
## [4] "    Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th."                                                                                      
## [5] "    When a bank is hit by a brutal heist, all evidence points to the owner and his high-powered clients. But as a group of FBI agents dig deeper into the case - and the deadly heists continue - it becomes clear that a larger conspiracy is at play."
## [6] "    When his uptight CEO sister threatens to shut down his branch, the branch manager throws an epic Christmas party in order to land a big client and save the day, but the party gets way out of hand..."
# Select the runtime nodes with a CSS selector
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Extract the runtime text and preview it
runtime_data <- runtime_data_html %>% html_text()
head(runtime_data)
## [1] "108 min" "123 min" "133 min" "117 min" "107 min" "105 min"
# Strip the " min" suffix and parse the remainder as numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))

# Preview the numeric runtimes
head(runtime_data)
## [1] 108 123 133 117 107 105
# Select the genre nodes with a CSS selector
genre_data_html <- webpage %>% html_nodes(".genre")

# Extract the genre text and preview it
genre_data <- genre_data_html %>% html_text()
head(genre_data)
## [1] "\nAction, Adventure, Comedy            " 
## [2] "\nAction, Adventure, Fantasy            "
## [3] "\nAction, Adventure, Sci-Fi            " 
## [4] "\nHorror, Thriller            "          
## [5] "\nAction, Crime, Thriller            "   
## [6] "\nComedy            "
# Clean the genre strings in one pipeline:
#   1. remove every newline and space character,
#   2. keep only the first genre (drop everything from the first comma on),
#   3. store the result as a factor.
genre_data <- genre_data %>%
  gsub("[\n ]", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()

# Preview the cleaned genres
head(genre_data)
## [1] Action Action Action Horror Action Comedy
## Levels: Action Adventure Animation Biography Comedy Crime Drama Horror
# Select the IMDb rating nodes with a CSS selector
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

# Extract the rating text and preview it
rating_data <- rating_data_html %>% html_text()
head(rating_data)
## [1] "8.0" "6.0" "7.8" "7.3" "5.5" "5.8"
# Parse the rating strings as numbers
rating_data <- rating_data %>% as.numeric()

# Preview the numeric ratings
head(rating_data)
## [1] 8.0 6.0 7.8 7.3 5.5 5.8
# Select the vote-count nodes with a CSS selector
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Extract the vote-count text and preview it
votes_data <- votes_data_html %>% html_text()
head(votes_data)
## [1] "894,232" "594,545" "547,528" "417,569" "19,271"  "67,902"
# Drop the comma separators, then parse the counts as numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))

# Preview the numeric vote counts
head(votes_data)
## [1] 894232 594545 547528 417569  19271  67902
# Select the director links with a CSS selector
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the director names and preview them
directors_data <- directors_data_html %>% html_text()
head(directors_data)
## [1] "Tim Miller"         "David Ayer"         "Gareth Edwards"    
## [4] "M. Night Shyamalan" "Steven C. Miller"   "Josh Gordon"
# Store the director names as a factor
directors_data <- directors_data %>% as.factor()

# Preview the factor
head(directors_data)
## [1] Tim Miller         David Ayer         Gareth Edwards     M. Night Shyamalan
## [5] Steven C. Miller   Josh Gordon       
## 98 Levels: Alex Proyas Ana Lily Amirpour André Øvredal ... Zack Snyder
# Select the actor links with a CSS selector
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

# Extract the actor names and store them as a factor
actors_data <- actors_data_html %>%
  html_text() %>%
  as.factor()

# Preview the factor
head(actors_data)
## [1] Ryan Reynolds  Will Smith     Felicity Jones James McAvoy   Bruce Willis  
## [6] Jason Bateman 
## 90 Levels: Aamir Khan Alexander Skarsgård Amy Adams ... Zoey Deutch
# Using CSS selectors to scrape the metascore section.
# Some movies have no Metascore, so the lookup is done once per movie
# container instead of over the whole page: html_node() yields exactly one
# result per container, and html_text() gives NA where no '.metascore'
# element exists. Every score therefore stays on the row of its own movie,
# with no hard-coded NA positions and no trimming of surplus entries.
movie_items_html <- html_nodes(webpage, '.lister-item-content')
metascore_data_html <- html_node(movie_items_html, '.metascore')

# Converting the metascore data to text (NA for movies without a Metascore)
metascore_data <- html_text(metascore_data_html)

# Let's have a look at the metascore data
head(metascore_data)

# Data-Preprocessing: removing the padding whitespace around each score
metascore_data <- trimws(metascore_data)

# Data-Preprocessing: converting metascore to numerical (NA stays NA)
metascore_data <- as.numeric(metascore_data)

# One entry per movie container
length(metascore_data)
summary(metascore_data)
# Use CSS selector to scrape gross revenue section.
# Some movies have no gross figure, so the lookup is done once per movie
# container: html_node() yields exactly one result per container, and
# html_text() gives NA where the gross element is absent. This replaces the
# hard-coded list of NA positions, which depended on one particular snapshot
# of the page and left the vector one element too long.
movie_items_html <- html_nodes(webpage, '.lister-item-content')
gross_data_html <- html_node(movie_items_html, '.ghost~ .text-muted+ span')

# Convert gross revenue data to text (NA for movies without a gross figure)
gross_data <- html_text(gross_data_html)
head(gross_data)

# Remove the '$' sign, any thousands separators and the 'M' suffix.
# The digits are kept in full; cutting the string to a fixed width would
# drop decimals (e.g. "$363.07M" would become 363.0).
gross_data <- gsub("[$,M]", "", gross_data)

# Length of gross data: one entry per movie container
length(gross_data)

# Data-Preprocessing: converting gross to numerical (NA stays NA)
gross_data <- as.numeric(gross_data)
summary(gross_data)
# NOTE: gross values printed further down in this document were recorded
# before this fix; re-run the script to refresh them.
# Assemble the scraped vectors into a single data frame, one column each
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types of the data frame
str(movies_df)
## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : chr  "Deadpool" "Suicide Squad" "Rogue One: A Star Wars Story" "Split" ...
##  $ Description         : chr  "    A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the"| __truncated__ "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defens"| __truncated__ "    The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star." "    Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape befo"| __truncated__ ...
##  $ Runtime             : num  108 123 133 117 107 105 107 115 116 139 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 1 1 1 8 1 5 3 1 7 4 ...
##  $ Rating              : num  8 6 7.8 7.3 5.5 5.8 7.6 7.5 7.9 8.1 ...
##  $ Metascore           : num  65 40 65 62 42 42 81 72 81 71 ...
##  $ Votes               : num  894232 594545 547528 417569 19271 ...
##  $ Gross_Earning_in_Mil: num  363 325.1 532.1 138.2 54.8 ...
##  $ Director            : Factor w/ 98 levels "Alex Proyas",..: 94 24 34 61 88 54 83 86 29 66 ...
##  $ Actor               : Factor w/ 90 levels "Aamir Khan","Alexander Skarsgård",..: 73 86 32 37 14 38 6 8 3 4 ...
# Preview the first rows of the assembled data frame
movies_df %>% head()
##   Rank                        Title
## 1    1                     Deadpool
## 2    2                Suicide Squad
## 3    3 Rogue One: A Star Wars Story
## 4    4                        Split
## 5    5                    Marauders
## 6    6       Office Christmas Party
##                                                                                                                                                                                                                                               Description
## 1                                                                                                                   A wisecracking mercenary gets experimented on and becomes immortal but ugly, and sets out to track down the man who ruined his looks.
## 2                                                                     A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse.
## 3                                                                                                                                   The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the plans for the Death Star.
## 4                                                                                           Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th.
## 5     When a bank is hit by a brutal heist, all evidence points to the owner and his high-powered clients. But as a group of FBI agents dig deeper into the case - and the deadly heists continue - it becomes clear that a larger conspiracy is at play.
## 6                                                  When his uptight CEO sister threatens to shut down his branch, the branch manager throws an epic Christmas party in order to land a big client and save the day, but the party gets way out of hand...
##   Runtime  Genre Rating Metascore  Votes Gross_Earning_in_Mil
## 1     108 Action    8.0        65 894232               363.00
## 2     123 Action    6.0        40 594545               325.10
## 3     133 Action    7.8        65 547528               532.10
## 4     117 Horror    7.3        62 417569               138.20
## 5     107 Action    5.5        42  19271                54.77
## 6     105 Comedy    5.8        42  67902                   NA
##             Director          Actor
## 1         Tim Miller  Ryan Reynolds
## 2         David Ayer     Will Smith
## 3     Gareth Edwards Felicity Jones
## 4 M. Night Shyamalan   James McAvoy
## 5   Steven C. Miller   Bruce Willis
## 6        Josh Gordon  Jason Bateman
# Histogram of movie runtimes (30 bins), bars filled by genre.
# Written with ggplot() + geom_histogram() rather than qplot(), which is
# deprecated in current ggplot2 releases; the resulting plot is the same.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

# Movie(s) with the longest runtime.
# na.rm = TRUE keeps a missing runtime from turning max() into NA, which
# would make the comparison NA for every row and return no movies.
q1 <- movies_df %>%
  select(Title, Rank, Runtime, Genre) %>%
  filter(Runtime == max(Runtime, na.rm = TRUE))
q1
##     Title Rank Runtime  Genre
## 1 Silence   57     161  Drama
## 2  Dangal   71     161 Action
# Runtime against rating; point size shows votes, point colour shows genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre))

# Movies with a runtime between 130 and 160 minutes (bounds included)
q2 <- movies_df %>%
  filter(between(Runtime, 130, 160)) %>%
  select(Title, Rank, Runtime, Votes, Genre)

# Horizontal bar chart of votes per genre; bars of the same genre stack
q2_plot <- ggplot(q2, aes(x = Genre, y = Votes)) +
  geom_col() +
  labs(x = "Movie Genre", y = "Votes", title = "Movie Genres by Vote") +
  coord_flip()
q2_plot

# The movie(s) in that runtime range with the most votes
q2_df <- q2 %>%
  filter(Votes == max(Votes))
q2_df
##                        Title Rank Runtime  Votes  Genre
## 1 Captain America: Civil War   13     147 655335 Action
# Runtime against gross earnings; point size shows rating, colour shows genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre))
## Warning: Removed 10 rows containing missing values (geom_point).

#Notes:

From the visualizations we can see that action movies received the most votes.

Drama and action movies had the longest runtimes. It is surprising how many people watch action movies, as I would have presumed that people would rather watch a comedy or a drama to sit back and relax at the end or the start of the day.