# Scraping IMDB ALL TIME TOP RATED 250 MOVIES WITH RATING
# loading rvest package
# install.packages('rvest')
library(rvest)
## Loading required package: xml2
# Address of the website to be scraped (IMDb top-rated chart)
url <- "https://www.imdb.com/chart/top?sort=rk,asc&mode=simple&page=1"
# Download the page and parse its HTML into a document object
imdb <- read_html(x = url)
# Selector to scrape the Rating section.
# The selector is scoped to the rating column instead of matching every
# <strong> on the page: a bare 'strong' selector also returns unrelated
# elements (sidebar titles, names, counts), and skipping them by position
# breaks as soon as their number changes.
# NOTE(review): assumes each chart row holds its rating in
# <td class="ratingColumn imdbRating"><strong>...</strong></td> -- confirm
# against the live page markup.
rank_data_html <- html_nodes(imdb, 'td.ratingColumn.imdbRating strong')
# converting Rating data to text
rank_data <- html_text(rank_data_html)
# Rating data format
head(rank_data)
## [1] "9.2" "9.2" "9.0" "9.0" "8.9" "8.9"
# checking class
class(rank_data)
## [1] "character"
# Data preprocessing: converting ratings to numeric
rank_data <- as.numeric(rank_data)
# Fail loudly if the selector matched nothing or matched non-numeric text,
# rather than carrying an empty vector or NAs into the final data frame.
stopifnot(length(rank_data) > 0, !anyNA(rank_data))
# data after preprocessing
head(rank_data)
## [1] 9.2 9.2 9.0 9.0 8.9 8.9
# Select the link nodes inside the title column of the chart
title_data_html <- html_nodes(x = imdb, css = ".titleColumn a")
# Extract the visible text (the movie title) from each selected node
title_data <- html_text(x = title_data_html)
# Preview the first few titles
head(x = title_data)
## [1] "The Shawshank Redemption" "The Godfather"
## [3] "The Godfather: Part II" "The Dark Knight"
## [5] "12 Angry Men" "Schindler's List"
# Combine the title and rating vectors into a data frame, one row per movie.
# Guard against a partial scrape first: data.frame() silently recycles the
# shorter vector when its length divides the longer one evenly.
stopifnot(length(title_data) == length(rank_data))
# stringsAsFactors = FALSE keeps titles as plain character strings; on
# R < 4.0 the default would turn them into a factor, which is rarely wanted
# for free-text data.
Top_movies <- data.frame(Title = title_data, Rating = rank_data,
                         stringsAsFactors = FALSE)
# checking NA's in scraped data
colSums(is.na(Top_movies))
## Title Rating
## 0 0
# structure of data frame
str(Top_movies)
## 'data.frame': 250 obs. of 2 variables:
## $ Title : chr "The Shawshank Redemption" "The Godfather" "The Godfather: Part II" "The Dark Knight" ...
## $ Rating: num 9.2 9.2 9 9 8.9 8.9 8.9 8.9 8.8 8.8 ...
# Print the first 50 rows of the data frame (the top 50 movies)
head(x = Top_movies, n = 50)
## Title Rating
## 1 The Shawshank Redemption 9.2
## 2 The Godfather 9.2
## 3 The Godfather: Part II 9.0
## 4 The Dark Knight 9.0
## 5 12 Angry Men 8.9
## 6 Schindler's List 8.9
## 7 The Lord of the Rings: The Return of the King 8.9
## 8 Pulp Fiction 8.9
## 9 Il buono, il brutto, il cattivo 8.8
## 10 Fight Club 8.8
## 11 The Lord of the Rings: The Fellowship of the Ring 8.8
## 12 Forrest Gump 8.7
## 13 Star Wars: Episode V - The Empire Strikes Back 8.7
## 14 Inception 8.7
## 15 The Lord of the Rings: The Two Towers 8.7
## 16 One Flew Over the Cuckoo's Nest 8.7
## 17 Goodfellas 8.7
## 18 The Matrix 8.6
## 19 Shichinin no samurai 8.6
## 20 Avengers: Infinity War 8.6
## 21 Cidade de Deus 8.6
## 22 Star Wars 8.6
## 23 Se7en 8.6
## 24 The Silence of the Lambs 8.6
## 25 It's a Wonderful Life 8.6
## 26 La vita è bella 8.6
## 27 The Usual Suspects 8.6
## 28 Sen to Chihiro no kamikakushi 8.5
## 29 Saving Private Ryan 8.5
## 30 Léon 8.5
## 31 The Green Mile 8.5
## 32 Interstellar 8.5
## 33 American History X 8.5
## 34 Psycho 8.5
## 35 Once Upon a Time in the West 8.5
## 36 City Lights 8.5
## 37 Casablanca 8.5
## 38 Modern Times 8.5
## 39 The Intouchables 8.5
## 40 The Pianist 8.5
## 41 The Departed 8.5
## 42 Terminator 2: Judgment Day 8.5
## 43 Back to the Future 8.5
## 44 Raiders of the Lost Ark 8.5
## 45 Rear Window 8.5
## 46 Whiplash 8.5
## 47 Gladiator 8.5
## 48 The Lion King 8.5
## 49 The Prestige 8.5
## 50 Memento 8.4
# To see the full movie list in the data viewer.
# View() needs an interactive session with a display; guarding it lets the
# script run under Rscript / knitr and still reach the CSV export below.
if (interactive()) {
  View(Top_movies)
}
# Saving result in csv file (written to the current working directory;
# use getwd() to check where that is)
write.csv(Top_movies, file = "IMDb_Movies.csv", row.names = FALSE)