Introduction
We use web scraping to collect data from IMDb on the 100 most popular feature films released in 2016 and then graph the results.
Load libraries
library(rvest)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Web Scraping
# IMDb advanced-search query: 100 results per page, feature films released
# in 2016. Requested over HTTPS so the fetch is encrypted and does not rely
# on the server redirecting a plain-HTTP request.
url <- "https://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the result page once; every selector below queries this
# parsed document.
webpage <- read_html(url)
Scrape the data
# Select one node per search result so that every field is extracted per
# movie and position i in each vector refers to the same movie.
# NOTE(review): assumes each result is wrapped in a `.lister-item-content`
# element -- confirm against the page markup.
movie_nodes <- webpage %>% html_nodes(".lister-item-content")

# Extract the text of the first node matching `css` inside each movie.
# html_node() (singular) yields exactly one entry per movie and a missing
# node when nothing matches, so html_text() returns NA there instead of
# silently shortening the vector.
scrape_field <- function(css) {
  movie_nodes %>%
    html_node(css) %>%
    html_text()
}

rank_data <- scrape_field(".text-primary")
title_data <- scrape_field(".lister-item-header a")
# NOTE(review): the recorded str() output shows Description holding values
# such as "(2016)", so this selector may be matching the release-year element
# rather than the plot summary -- verify the selector.
description_data <- scrape_field(".text-muted:nth-child(3)")
genre_data <- scrape_field(".genre") %>% gsub("\\s+", "", .)

# The numeric fields are kept as text here and converted during cleaning.
# The previous `%>% suppressWarnings(as.numeric(.))` step converted nothing:
# magrittr still inserts the left-hand side as the first argument, so the call
# became suppressWarnings(x, as.numeric(x)) and returned x unchanged.
runtime_data <- scrape_field(".runtime") %>% gsub(" min", "", .)
rating_data <- scrape_field(".inline-block strong")
# Only the first `span[name='nv']` per movie is taken; a document-wide match
# can return more than one such span per movie.
# NOTE(review): presumably the first span is the vote count -- verify.
votes_data <- scrape_field("span[name='nv']") %>% gsub(",", "", .)
Cleaning the data and handling missing values
# Convert the scraped text to numbers by stripping every character except
# digits (and the decimal point for ratings). Entries left with no digits,
# and entries that were already NA, become NA.
runtime_data <- as.numeric(gsub("[^0-9]", "", runtime_data))
rating_data <- as.numeric(gsub("[^0-9.]", "", rating_data))
votes_data <- as.numeric(gsub("[^0-9]", "", votes_data))

# NAs are deliberately NOT removed from the individual vectors: dropping an
# entry from one vector shifts all of its later values up by one position and
# pairs them with the wrong movie. Incomplete rows are removed once, after the
# vectors have been combined into a data frame.

# Strip leading/trailing whitespace from the character vectors.
title_data <- trimws(title_data)
description_data <- trimws(description_data)
genre_data <- trimws(genre_data)

# data.frame() needs columns of equal length, so only the positions present in
# every vector are kept.
field_lengths <- c(
  rank = length(rank_data),
  title = length(title_data),
  description = length(description_data),
  genre = length(genre_data),
  runtime = length(runtime_data),
  rating = length(rating_data),
  votes = length(votes_data)
)
min_length <- min(field_lengths)

# Unequal lengths mean at least one selector matched a different number of
# nodes than the others, so rows may be misaligned; say so instead of
# truncating silently.
if (length(unique(field_lengths)) > 1) {
  warning(
    "Scraped vectors differ in length (",
    paste(names(field_lengths), field_lengths, sep = "=", collapse = ", "),
    "); truncating to ", min_length, " rows, values may be misaligned.",
    call. = FALSE
  )
}

# seq_len() yields an empty index when min_length is 0, whereas 1:0 would
# select c(1, 0) and produce a spurious one-element NA vector.
keep <- seq_len(min_length)
rank_data <- rank_data[keep]
title_data <- title_data[keep]
description_data <- description_data[keep]
genre_data <- genre_data[keep]
runtime_data <- runtime_data[keep]
rating_data <- rating_data[keep]
votes_data <- votes_data[keep]

# Assemble one row per movie.
movies_data <- data.frame(
  Rank = as.integer(rank_data),
  Title = as.character(title_data),
  Description = as.character(description_data),
  Genre = as.character(genre_data),
  Runtime = runtime_data,
  Rating = rating_data,
  Votes = votes_data,
  stringsAsFactors = FALSE
)

# Drop whole rows that have any missing value, keeping the remaining rows
# internally consistent.
movies_data <- na.omit(movies_data)
Building the graphs
First graph from tutorial: library('ggplot2')
qplot(data = movies_df,Runtime,fill = Genre,bins = 30)
# Make sure Runtime is numeric before binning (a no-op when it already is).
movies_data$Runtime <- as.numeric(as.character(movies_data$Runtime))

# Genre holds comma-separated labels (e.g. "Horror,Thriller"); count every
# individual label and order the counts from most to least frequent.
genre_counts <- sort(
  table(unlist(strsplit(movies_data$Genre, ","))),
  decreasing = TRUE
)

# head() returns at most 10 entries, whereas `[1:10]` pads the result with NA
# names when fewer than 10 distinct genres are present.
common_genres <- names(head(genre_counts, 10))

# One row per (movie, genre) pair, restricted to the most common genres.
movies_data_long <- tidyr::separate_rows(movies_data, Genre, sep = ",")
movies_data_long <- dplyr::filter(movies_data_long, Genre %in% common_genres)

# Overlaid, semi-transparent runtime histograms, one fill colour per genre.
ggplot(movies_data_long, aes(x = Runtime, fill = Genre)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 30) +
  labs(
    title = "Distribution of Runtime by Genre for 100 Most Popular Movies of 2016",
    x = "Runtime",
    y = "Count",
    fill = "Genre"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
Second Graph From Tutorial: ggplot(movies_df,aes(x=Runtime,y=Rating))+ geom_point(aes(size=Votes,col=Genre))
# Scatter plot of rating against runtime: point size encodes the number of
# votes and point colour encodes the (combined) genre string.
ggplot(movies_data, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, col = Genre), alpha = 0.6) +
  scale_size_continuous(range = c(1, 10)) +
  theme_minimal() +
  labs(
    title = "Rating vs. Runtime of Movies",
    x = "Runtime (in minutes)",
    y = "Rating (out of 10)",
    subtitle = "Size of points represent number of votes",
    caption = "Source: IMDb"
  ) +
  # The documented value for hiding legends is lowercase "none"; "None" is not
  # a documented option.
  theme(legend.position = "none")
Third Graph From Tutorial: ggplot(movies_df,aes(x=Runtime,y=Gross_Earning_in_Mil))+ geom_point(aes(size=Rating,col=Genre))
# Scatter plot of votes against runtime: point size encodes the rating and
# point colour encodes the (combined) genre string. The tutorial plots gross
# earnings here, but this data set has no earnings column, so Votes is used
# and the title and axis label describe what is actually plotted.
plot <- ggplot(movies_data, aes(x = Runtime, y = Votes)) +
  geom_point(aes(size = Rating, color = Genre), alpha = 0.7) +
  scale_size_continuous(range = c(2, 10)) +
  labs(
    title = "Runtime vs. Votes of Movies",
    x = "Runtime",
    y = "Votes",
    subtitle = "Size of points represent movie ratings while colors represent different genres",
    caption = "Data Source: IMDb"
  ) +
  theme_minimal() +
  # The documented value for hiding legends is lowercase "none"; "None" is not
  # a documented option.
  theme(legend.position = "none")

# The plot was assigned rather than auto-printed, so render it explicitly.
print(plot)
Questions:
# Question: which movie has the longest runtime?
# Order the rows from longest to shortest runtime and keep the top one.
longest_runtime_movie <- head(arrange(movies_data, desc(Runtime)), n = 1)
longest_runtime_movie
## Rank Title Description Genre Runtime Rating Votes
## 1 3 Silence (I) (2016) Drama,History 161 7.2 32510
# Question: among movies running 130-160 minutes (inclusive), which genre
# combination collected the most votes in total?
highest_votes_genre <- movies_data %>%
  filter(between(Runtime, 130, 160)) %>%
  group_by(Genre) %>%
  summarise(TotalVotes = sum(Votes)) %>%
  arrange(desc(TotalVotes)) %>%
  slice(1)
highest_votes_genre
## # A tibble: 1 × 2
## Genre TotalVotes
## <chr> <dbl>
## 1 Action,Adventure,Sci-Fi 486343
sum(is.na(movies_data[["Runtime"]])) # Number of missing Runtime values
## [1] 0
str(movies_data) # Inspect the column types and the first few values of each column
## 'data.frame': 100 obs. of 7 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Terrifier" "Suicide Squad" "Silence" "Hush" ...
## $ Description: chr "(2016)" "(2016)" "(I) (2016)" "(I) (2016)" ...
## $ Genre : chr "Horror,Thriller" "Action,Adventure,Fantasy" "Drama,History" "Horror,Thriller" ...
## $ Runtime : num 85 123 161 82 134 117 139 145 128 108 ...
## $ Rating : num 5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
## $ Votes : num 47833 710294 32510 119507 710 ...
head(movies_data, n = 6L) # Preview the first six rows of the data
## Rank Title Description Genre Runtime Rating
## 1 1 Terrifier (2016) Horror,Thriller 85 5.6
## 2 2 Suicide Squad (2016) Action,Adventure,Fantasy 123 5.9
## 3 3 Silence (I) (2016) Drama,History 161 7.2
## 4 4 Hush (I) (2016) Horror,Thriller 82 6.6
## 5 5 The Conjuring 2 (2016) Horror,Mystery,Thriller 134 7.3
## 6 6 Split (IX) (2016) Horror,Thriller 117 7.3
## Votes
## 1 47833
## 2 710294
## 3 32510
## 4 119507
## 5 710
## 6 149333