#install.packages('rvest')
library(rvest)

## Warning: package 'rvest' was built under R version 3.6.3

## Loading required package: xml2

library(tidyverse)

## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.2

## Warning: package 'stringr' was built under R version 3.6.3

## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

# Search-results page to scrape: feature films released in 2016,
# 100 results per page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the page's HTML
webpage <- url %>% read_html()

Rankings

# Select the nodes holding each film's position in the list
rank_data_html <- webpage %>% html_nodes(".text-primary")

# Extract the text of those nodes
rank_data <- rank_data_html %>% html_text()

# Preview the raw rank strings
rank_data %>% head()

## [1] "1." "2." "3." "4." "5." "6."

# Data-Preprocessing: the strings end in "." ("1."), which as.numeric()
# still parses as a number
rank_data <- rank_data %>% as.numeric()

# Preview the numeric ranks
rank_data %>% head()

## [1] 1 2 3 4 5 6

Titles

# Select the title links inside each list item's header
title_data_html <- webpage %>% html_nodes(".lister-item-header a")

# Extract the title text
title_data <- title_data_html %>% html_text()

# Preview the first few titles
title_data %>% head()

## [1] "Moana"                                      
## [2] "Moonlight"                                  
## [3] "Suicide Squad"                              
## [4] "Rogue One: A Star Wars Story"               
## [5] "Miss Peregrine's Home for Peculiar Children"
## [6] "La La Land"

# Select the muted paragraph that directly follows each ratings bar
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")

# Extract the text and strip the newline characters from it
# (Data-Preprocessing: removing '\n')
description_data <- description_data_html %>%
  html_text() %>%
  gsub("\n", "", .)

# Preview the cleaned descriptions
description_data %>% head()

## [1] "    In Ancient Polynesia, when a terrible curse incurred by the Demigod Maui reaches Moana's island, she answers the Ocean's call to seek out the Demigod to set things right."                                                                   
## [2] "    A young African-American man grapples with his identity and sexuality while experiencing the everyday struggles of childhood, adolescence, and burgeoning adulthood."                                                                         
## [3] "    A secret government agency recruits some of the most dangerous incarcerated super-villains to form a defensive task force. Their first mission: save the world from the apocalypse."                                                          
## [4] "    The daughter of an Imperial scientist joins the Rebel Alliance in a risky move to steal the Death Star plans."                                                                                                                                
## [5] "    When Jacob (Asa Butterfield) discovers clues to a mystery that stretches across time, he finds Miss Peregrine's Home for Peculiar Children. But the danger deepens after he gets to know the residents and learns about their special powers."
## [6] "    While navigating their careers in Los Angeles, a pianist and an actress fall in love while attempting to reconcile their aspirations for the future."

# Select the runtime nodes
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")

# Extract the text, drop the " min" suffix and parse what is left as a number
runtime_data <- runtime_data_html %>%
  html_text() %>%
  gsub(" min", "", .) %>%
  as.numeric()

# Preview the numeric runtimes
runtime_data %>% head()

## [1] 107 111 123 133 127 128

# Select the genre nodes
genre_data_html <- webpage %>% html_nodes(".genre")

# Extract the text, then clean it step by step:
#   1. remove newline characters
#   2. remove every space
#   3. drop everything from the first comma on, keeping only the first genre
#   4. convert the remaining text to a factor
genre_data <- genre_data_html %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .) %>%
  as.factor()

# Preview the genre factor
genre_data %>% head()

## [1] Animation Drama     Action    Action    Adventure Comedy   
## 8 Levels: Action Adventure Animation Biography Comedy Crime ... Horror

# Select the <strong> element inside each IMDb rating widget
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")

# Extract the text and parse it as a number
rating_data <- rating_data_html %>%
  html_text() %>%
  as.numeric()

# Preview the numeric ratings
rating_data %>% head()

## [1] 7.6 7.4 6.0 7.8 6.7 8.0

# Select the second child <span> of each votes row
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Extract the text, remove the thousands separators (commas) and parse the
# result as a number
votes_data <- votes_data_html %>%
  html_text() %>%
  gsub(",", "", .) %>%
  as.numeric()

# Preview the numeric vote counts
votes_data %>% head()

## [1] 254970 258637 580790 532957 150547 480744

# Select the first link in the paragraph that follows a muted element
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Extract the director names as text
directors_data <- directors_data_html %>% html_text()

# Preview the director names
directors_data %>% head()

## [1] "Ron Clements"    "Barry Jenkins"   "David Ayer"      "Gareth Edwards" 
## [5] "Tim Burton"      "Damien Chazelle"

# Data-Preprocessing: store the director names as a factor
directors_data <- directors_data %>% as.factor()

# Select the link that immediately follows the `.ghost` element in each item
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")

# Extract the actor names as text and store them as a factor
actors_data <- actors_data_html %>%
  html_text() %>%
  as.factor()

# Preview the actor factor
actors_data %>% head()

## [1] Auli'i Cravalho Mahershala Ali  Will Smith      Felicity Jones 
## [5] Eva Green       Ryan Gosling   
## 92 Levels: Aamir Khan Adam Driver Adam Sandler ... Zoey Deutch

# Select the metascore nodes
metascore_data_html <- webpage %>% html_nodes(".metascore")

# Extract the metascore text
metascore_data <- metascore_data_html %>% html_text()

# Preview the raw values, which still carry trailing padding
metascore_data %>% head()

## [1] "81        " "99        " "40        " "65        " "57        "
## [6] "94        "

# Data-Preprocessing: strip every space from the metascore strings
metascore_data <- metascore_data %>% gsub(" ", "", .)

# Check how many metascores were actually scraped
metascore_data %>% length()

## [1] 98

# Only 98 metascores were scraped, so two list entries have none. Insert an NA
# at each of those positions (22 and 80 -- presumably found by inspecting the
# page; re-check them if the listing changes) so the vector has one value per
# film.
#
# append(..., after = i - 1) replaces the manual split-and-rejoin with
# x[i:length(x)]: that range counts DOWN once i exceeds the current length and
# would re-append the last element. Inserting a real NA rather than the string
# "NA" also keeps metascore_data an atomic character vector, so the numeric
# conversion below no longer depends on a coercion warning.
for (i in c(22, 80)) {
  metascore_data <- append(metascore_data, NA, after = i - 1)
}

# Data-Preprocessing: converting metascore to numerical
metascore_data <- as.numeric(metascore_data)

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

# Confirm the metascore vector now has one entry per film

metascore_data %>% length()

## [1] 100

# Summary statistics, including the number of NAs
metascore_data %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   47.25   62.00   60.19   73.50   99.00       2

Gross Revenue

#split out because this one gave me trouble. 

# Select the gross-revenue <span> (the span directly after a muted element
# that itself follows a `.ghost` sibling)
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")

# Extract the gross revenue text
gross_data <- gross_data_html %>% html_text()

# Preview the raw gross revenue strings
gross_data %>% head()

## [1] "$248.76M" "$27.85M"  "$325.10M" "$532.18M" "$87.24M"  "$151.10M"

# Data-Preprocessing: removing '$' and 'M' signs
#
# Both symbols are stripped by pattern instead of by character position.
# A fixed substring(x, 2, 6) window keeps only five characters, which cuts
# "$248.76" down to "248.7" and so drops the last digit of every value of
# 100 or more.
gross_data <- gsub("[$M]", "", gross_data)

# Let's check the length of gross data
length(gross_data)

## [1] 90

# Filling missing entries with NA
#
# Only 90 gross figures were scraped for the 100 listed films. Each index
# below is a position at which an NA is inserted so that the vector ends up
# with one value per film (the positions are hard-coded -- presumably found by
# inspecting the page; re-check them if the listing changes).
#
# append(..., after = i - 1) replaces the manual split-and-rejoin with
# x[i:length(x)]. When i is one past the end (i = 100 while the vector holds
# 99 values) `100:99` counts DOWN, which re-appends the last element and
# leaves 101 values; append() simply adds at the end in that case.
# Inserting a real NA rather than list("NA") also keeps gross_data an atomic
# character vector instead of turning it into a list.
for (i in c(22, 48, 52, 63, 72, 84, 91, 93, 94, 100)) {
  gross_data <- append(gross_data, NA, after = i - 1)
}

gross_data %>% typeof()

## [1] "list"

# Data-Preprocessing: flatten to an atomic vector, force it to character, then
# parse as numbers; entries that are not numeric text become NA
gross_data <- gross_data %>%
  unlist() %>%
  as.character() %>%
  as.numeric()

## Warning: NAs introduced by coercion

# Check the length of the gross data after filling and conversion
gross_data %>% length()

## [1] 101

gross_data %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.18   27.36   57.64   99.12  125.80  532.10      10

# Print the full vector
gross_data

##   [1] 248.70  27.85 325.10 532.10  87.24 151.10 330.30 341.20 100.50  36.26
##  [11]  67.21 232.60 408.00 363.00   5.02  58.70 234.00   5.88   2.01 169.60
##  [21]   2.13     NA  93.43 138.20  56.25  54.65  10.64 126.60  34.34 158.80
##  [31]  52.85 155.40  47.70   1.33 100.00 270.40  86.26 103.10  89.22  35.82
##  [41]  97.69  51.74  14.43  75.40  26.86   7.70  61.43     NA 162.40 153.70
##  [51] 127.40     NA  31.15  65.08  30.08  47.37   4.21  35.59   8.58  55.12
##  [61]  72.08 102.40     NA   5.20   7.10 364.00 128.30  43.03  46.84  67.27
##  [71] 125.00     NA  30.35  60.32  66.18  26.41  40.10 486.30 113.20  12.39
##  [81]  12.79  26.83  82.05     NA   0.18  62.68  34.92   0.66   8.11 368.30
##  [91]     NA  21.59     NA     NA  31.89  46.01  10.91   2.14  57.64     NA
## [101]  57.64

# Keep exactly the first 100 entries, one per film. A 101st value can appear
# when an NA-fill index is one past the end of the vector: `i:length(x)` then
# counts down and the last element gets appended a second time.

gross_data <- gross_data[seq_len(100)]

# Print the trimmed vector
gross_data

##   [1] 248.70  27.85 325.10 532.10  87.24 151.10 330.30 341.20 100.50  36.26
##  [11]  67.21 232.60 408.00 363.00   5.02  58.70 234.00   5.88   2.01 169.60
##  [21]   2.13     NA  93.43 138.20  56.25  54.65  10.64 126.60  34.34 158.80
##  [31]  52.85 155.40  47.70   1.33 100.00 270.40  86.26 103.10  89.22  35.82
##  [41]  97.69  51.74  14.43  75.40  26.86   7.70  61.43     NA 162.40 153.70
##  [51] 127.40     NA  31.15  65.08  30.08  47.37   4.21  35.59   8.58  55.12
##  [61]  72.08 102.40     NA   5.20   7.10 364.00 128.30  43.03  46.84  67.27
##  [71] 125.00     NA  30.35  60.32  66.18  26.41  40.10 486.30 113.20  12.39
##  [81]  12.79  26.83  82.05     NA   0.18  62.68  34.92   0.66   8.11 368.30
##  [91]     NA  21.59     NA     NA  31.89  46.01  10.91   2.14  57.64     NA

Combine into dataframe

# Assemble the scraped vectors into a single data frame, one column per vector
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Inspect the column types and the first few values of each column

movies_df %>% str()

## 'data.frame':    100 obs. of  11 variables:
##  $ Rank                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title               : Factor w/ 100 levels "10 Cloverfield Lane",..: 49 50 66 58 48 40 11 100 8 88 ...
##  $ Description         : Factor w/ 100 levels "    A deaf and mute writer who retreated into the woods to live a solitary life must fight for her life in sile"| __truncated__,..: 60 29 20 78 94 98 47 57 13 56 ...
##  $ Runtime             : num  107 111 123 133 127 128 151 108 116 116 ...
##  $ Genre               : Factor w/ 8 levels "Action","Adventure",..: 3 7 1 1 2 5 1 3 7 1 ...
##  $ Rating              : num  7.6 7.4 6 7.8 6.7 8 6.5 8 7.9 7.4 ...
##  $ Metascore           : num  81 99 40 65 57 94 44 78 81 70 ...
##  $ Votes               : num  254970 258637 580790 532957 150547 ...
##  $ Gross_Earning_in_Mil: num  248.7 27.9 325.1 532.1 87.2 ...
##  $ Director            : Factor w/ 98 levels "Alex Proyas",..: 82 11 25 35 92 20 98 14 29 86 ...
##  $ Actor               : Factor w/ 92 levels "Aamir Khan","Adam Driver",..: 8 52 89 32 31 72 9 34 5 70 ...

Analysis

# Histogram of runtimes (30 bins) with bars filled by genre. Written as
# ggplot() + geom_histogram() because qplot() is deprecated as of
# ggplot2 3.4.0; for a single numeric x this is the plot qplot() builds.
ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

# Runtime vs. rating; point size shows votes, colour shows genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, colour = Genre))

# Runtime vs. gross earnings; point size shows rating, colour shows genre
movies_df %>%
  ggplot(aes(x = Runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Rating, colour = Genre))

## Warning: Removed 10 rows containing missing values (geom_point).

Questions:

Question 1: Based on the above data, which movie from which Genre had the longest runtime?

# Sort films from longest to shortest runtime and keep the first six rows
q1 <- head(arrange(movies_df, desc(Runtime)))
q1

American Honey, a Drama, has the longest runtime, at 163 minutes.

Question 2: Based on the above data, in the Runtime of 130-160 mins, which genre has the highest votes?

# Total votes per genre for films whose runtime is above 129 and below 161,
# ordered from most to fewest votes
q2 <- local({
  in_window <- filter(movies_df, Runtime > 129, Runtime < 161)
  votes_by_genre <- summarise(group_by(in_window, Genre), sum = sum(Votes))
  arrange(votes_by_genre, desc(sum))
})
q2

In the 130-160 runtime bracket, Action movies had the most votes.

Question 3: Based on the above data, which genre has the highest average gross earnings among films with a runtime of 100 to 120 minutes?

# Average gross earnings per genre for films whose runtime is above 99 and
# below 121, ordered from highest to lowest average.
#
# mean() is used (not sum()) so that avg_gross really is the average the
# question asks for. na.rm = TRUE is needed because Gross_Earning_in_Mil
# contains NAs; without it, one missing value makes the whole genre NA.
q3 <- movies_df %>%
  filter(Runtime > 99 & Runtime < 121) %>%
  group_by(Genre) %>%
  summarise(avg_gross = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  arrange(desc(avg_gross))
q3

Animated films had the highest average gross in the 100-120 runtime bracket.

Webscraping Assignment