# Top_IMDb_250_movies.R

# Scraping IMDB ALL TIME TOP RATED 250  MOVIES WITH RATING 

# Loading the rvest package
# install.packages('rvest')
library(rvest)

## Loading required package: xml2

# Address of the IMDb Top 250 chart page to be scraped
url <- "https://www.imdb.com/chart/top?sort=rk,asc&mode=simple&page=1"

# Download the page and parse its HTML; every selector step below
# queries this parsed document

imdb <- read_html(x = url)

# Select the rating cells of the chart.
# A bare 'strong' selector also matches unrelated elements ahead of the
# ratings (a recorded run returned a promoted title, "203", "Harrison Ford"
# and "103" before the first rating). That forced a hard-coded 5:254 slice,
# which silently misaligns ratings and titles as soon as the number of those
# leading elements changes. Scoping the selector to the rating column removes
# the need for positional slicing.
# NOTE(review): '.imdbRating' is assumed to be the class of the rating cell in
# the same chart layout that provides '.titleColumn' -- confirm on the page.
rank_data_html <- html_nodes(imdb, ".imdbRating strong")

# Converting the rating nodes to text
rank_data <- html_text(rank_data_html)

# Raw rating data: character strings such as "9.2"
head(rank_data)

# Checking class
class(rank_data)

## [1] "character"

# Data preprocessing: converting the rating text to numeric
rank_data <- as.numeric(rank_data)

# Fail loudly if the selector matched nothing or a value did not parse,
# rather than carrying NAs into the final table
stopifnot(length(rank_data) > 0, !anyNA(rank_data))

# Data after preprocessing
head(rank_data)

## [1] 9.2 9.2 9.0 9.0 8.9 8.9

# Movie titles: select the links inside the '.titleColumn' cells

title_data_html <- imdb %>% html_nodes(".titleColumn a")

# Extract the text of each selected link
title_data <- title_data_html %>% html_text()

# First few scraped titles
head(title_data)

## [1] "The Shawshank Redemption" "The Godfather"           
## [3] "The Godfather: Part II"   "The Dark Knight"         
## [5] "12 Angry Men"             "Schindler's List"

# Combining titles and ratings into one data frame.
# The two vectors are paired by position, so make sure they line up first:
# data.frame() silently recycles the shorter vector when the lengths divide
# evenly, which would produce a wrong table without any error.
stopifnot(length(title_data) == length(rank_data))

# stringsAsFactors = FALSE keeps Title as plain text on every R version
# (before R 4.0 the default converted it to a factor)
Top_movies <- data.frame(
  Title = title_data,
  Rating = rank_data,
  stringsAsFactors = FALSE
)

# Checking NA's in scraped data
colSums(is.na(Top_movies))

##  Title Rating 
##      0      0

# Structure of the data frame
str(Top_movies)

## 'data.frame':    250 obs. of  2 variables:
##  $ Title : chr  "The Shawshank Redemption" "The Godfather" "The Godfather: Part II" "The Dark Knight" ...
##  $ Rating: num  9.2 9.2 9 9 8.9 8.9 8.9 8.9 8.8 8.8 ...

# To see the top 50 movies
head(Top_movies, 50)

##                                                Title Rating
## 1                           The Shawshank Redemption    9.2
## 2                                      The Godfather    9.2
## 3                             The Godfather: Part II    9.0
## 4                                    The Dark Knight    9.0
## 5                                       12 Angry Men    8.9
## 6                                   Schindler's List    8.9
## 7      The Lord of the Rings: The Return of the King    8.9
## 8                                       Pulp Fiction    8.9
## 9                    Il buono, il brutto, il cattivo    8.8
## 10                                        Fight Club    8.8
## 11 The Lord of the Rings: The Fellowship of the Ring    8.8
## 12                                      Forrest Gump    8.7
## 13    Star Wars: Episode V - The Empire Strikes Back    8.7
## 14                                         Inception    8.7
## 15             The Lord of the Rings: The Two Towers    8.7
## 16                   One Flew Over the Cuckoo's Nest    8.7
## 17                                        Goodfellas    8.7
## 18                                        The Matrix    8.6
## 19                              Shichinin no samurai    8.6
## 20                            Avengers: Infinity War    8.6
## 21                                    Cidade de Deus    8.6
## 22                                         Star Wars    8.6
## 23                                             Se7en    8.6
## 24                          The Silence of the Lambs    8.6
## 25                             It's a Wonderful Life    8.6
## 26                                   La vita è bella    8.6
## 27                                The Usual Suspects    8.6
## 28                     Sen to Chihiro no kamikakushi    8.5
## 29                               Saving Private Ryan    8.5
## 30                                              Léon    8.5
## 31                                    The Green Mile    8.5
## 32                                      Interstellar    8.5
## 33                                American History X    8.5
## 34                                            Psycho    8.5
## 35                      Once Upon a Time in the West    8.5
## 36                                       City Lights    8.5
## 37                                        Casablanca    8.5
## 38                                      Modern Times    8.5
## 39                                  The Intouchables    8.5
## 40                                       The Pianist    8.5
## 41                                      The Departed    8.5
## 42                        Terminator 2: Judgment Day    8.5
## 43                                Back to the Future    8.5
## 44                           Raiders of the Lost Ark    8.5
## 45                                       Rear Window    8.5
## 46                                          Whiplash    8.5
## 47                                         Gladiator    8.5
## 48                                     The Lion King    8.5
## 49                                      The Prestige    8.5
## 50                                           Memento    8.4

# To see the full 250-movie list in the data viewer.
# View() opens a viewer window, which is only useful (and only reliably
# available) in an interactive session; skip it otherwise so a scripted run
# still reaches the save step below.
if (interactive()) {
  View(Top_movies)
}

# Saving the result as a CSV file in the current working directory
# (use getwd() to check which directory that is)

write.csv(Top_movies, file = "IMDb_Movies.csv", row.names = FALSE)

# Top_IMDb_250_movies.R

# DigaNT

# Sun Jun 10 11:58:37 2018