Introduction

We use web scraping to collect IMDb data on the 100 most popular feature films released in 2016, and then graph the results.

Load libraries

library(rvest)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Web Scraping

# IMDb search-results URL: feature films released in 2016, 100 per page.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the page once; every selector further down queries this
# parsed document.
webpage <- read_html(x = url)

Scrape the data

# Extract each field from the parsed results page with its CSS selector.
#
# The numeric fields are converted with a direct as.numeric() call instead of
# piping into `suppressWarnings(as.numeric(.))`. When the dot appears only
# inside a nested call, magrittr still inserts the left-hand side as the FIRST
# argument, so that form evaluated `suppressWarnings(x, as.numeric(x))` and
# returned the scraped text unchanged.
rank_data <- webpage %>%
  html_nodes(".text-primary") %>%
  html_text()

title_data <- webpage %>%
  html_nodes(".lister-item-header a") %>%
  html_text()

description_data <- webpage %>%
  html_nodes(".text-muted:nth-child(3)") %>%
  html_text()

# Remove all whitespace so multi-genre entries read "A,B,C".
genre_data <- webpage %>%
  html_nodes(".genre") %>%
  html_text() %>%
  gsub("\\s+", "", .)

# Drop the " min" suffix, then convert. Text that is not a plain number becomes
# NA; the coercion warning for those entries is expected and silenced.
runtime_data <- webpage %>%
  html_nodes(".runtime") %>%
  html_text() %>%
  gsub(" min", "", .)
runtime_data <- suppressWarnings(as.numeric(runtime_data))

rating_data <- webpage %>%
  html_nodes(".inline-block strong") %>%
  html_text()
rating_data <- suppressWarnings(as.numeric(rating_data))

# Remove thousands separators before converting.
votes_data <- webpage %>%
  html_nodes("span[name='nv']") %>%
  html_text() %>%
  gsub(",", "", .)
votes_data <- suppressWarnings(as.numeric(votes_data))

Cleaning the data and handling missing values

# Numeric fields: keep only digits (plus the decimal point for ratings),
# convert to numeric, then drop every entry that could not be parsed.
# NOTE(review): each vector is filtered on its own, so a dropped entry shortens
# only that vector and later positions may no longer line up with the same
# movie in the other vectors — confirm this is acceptable.
runtime_data <- na.omit(as.numeric(gsub("[^0-9]", "", runtime_data)))
rating_data <- na.omit(as.numeric(gsub("[^0-9.]", "", rating_data)))
votes_data <- na.omit(as.numeric(gsub("[^0-9]", "", votes_data)))

# Text fields: strip leading and trailing whitespace.
title_data <- trimws(title_data, which = "both")
description_data <- trimws(description_data, which = "both")
genre_data <- trimws(genre_data, which = "both")

# The vectors can differ in length after cleaning, and data.frame() needs
# equal-length columns, so find the length of the shortest one.
min_length <- min(
  length(rank_data),
  length(title_data),
  length(description_data),
  length(genre_data),
  length(runtime_data),
  length(rating_data),
  length(votes_data)
)

# Keep only the first `min_length` elements of every vector. head() is used
# instead of `x[1:min_length]` because `1:0` evaluates to c(1, 0): with an empty
# scrape result the old form kept one element instead of none.
rank_data <- head(rank_data, n = min_length)
title_data <- head(title_data, n = min_length)
description_data <- head(description_data, n = min_length)
genre_data <- head(genre_data, n = min_length)
runtime_data <- head(runtime_data, n = min_length)
rating_data <- head(rating_data, n = min_length)
votes_data <- head(votes_data, n = min_length)


# Combine the equal-length vectors into one row per position, coercing each
# column to its intended type, and discard any row with a missing value.
movies_data <- na.omit(
  data.frame(
    Rank = as.integer(rank_data),
    Title = as.character(title_data),
    Description = as.character(description_data),
    Genre = as.character(genre_data),
    Runtime = runtime_data,
    Rating = rating_data,
    Votes = votes_data
  )
)

Building the graphs

First graph from tutorial: library('ggplot2')

qplot(data = movies_df,Runtime,fill = Genre,bins = 30)

# Make sure Runtime is numeric before binning; going through character also
# handles a factor column correctly.
movies_data$Runtime <- as.numeric(as.character(movies_data$Runtime))

# Genre holds a comma-separated list per movie. Count each individual genre and
# keep the ten most frequent. head() is used instead of `[1:10]` so that fewer
# than ten distinct genres does not pad the result with NA names.
common_genres <- head(
  names(sort(table(unlist(strsplit(movies_data$Genre, ","))), decreasing = TRUE)),
  n = 10
)

# One row per (movie, genre) pair, restricted to the common genres.
movies_data_long <- tidyr::separate_rows(movies_data, Genre, sep = ",")

movies_data_long <- dplyr::filter(movies_data_long, Genre %in% common_genres)

# Overlaid (not stacked) semi-transparent histograms, one per genre.
ggplot(movies_data_long, aes(x = Runtime, fill = Genre)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 30) +
  labs(
    title = "Distribution of Runtime by Genre for 100 Most Popular Movies of 2016",
    x = "Runtime",
    y = "Count",
    fill = "Genre"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

Second Graph From Tutorial: ggplot(movies_df,aes(x=Runtime,y=Rating))+ geom_point(aes(size=Votes,col=Genre))

# Scatter plot of rating against runtime. Point size encodes the vote count and
# colour encodes the Genre value.
ggplot(movies_data, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, col = Genre), alpha = 0.6) +
  scale_size_continuous(range = c(1, 10)) +
  theme_minimal() +
  labs(
    title = "Rating vs. Runtime of Movies",
    x = "Runtime (in minutes)",
    y = "Rating (out of 10)",
    subtitle = "Size of points represent number of votes",
    caption = "Source: IMDb"
  ) +
  # Hide the legends. The documented value is lowercase "none"; "None" is not
  # one of the values theme() lists for legend.position.
  theme(legend.position = "none")

Third Graph From Tutorial: ggplot(movies_df,aes(x=Runtime,y=Gross_Earning_in_Mil))+ geom_point(aes(size=Rating,col=Genre))

# Scatter plot of vote count against runtime. Point size encodes the rating and
# colour encodes the Genre value.
# The title and y-axis label name the variable actually plotted (Votes); they
# previously said gross earnings, which this data frame does not contain.
plot <- ggplot(movies_data, aes(x = Runtime, y = Votes)) +
  geom_point(aes(size = Rating, color = Genre), alpha = 0.7) +
  scale_size_continuous(range = c(2, 10)) +
  labs(
    title = "Runtime vs. Votes of Movies",
    x = "Runtime",
    y = "Votes",
    subtitle = "Size of points represent movie ratings while colors represent different genres",
    caption = "Data Source: IMDb"
  ) +
  theme_minimal() +
  # Hide the legends. The documented value is lowercase "none".
  theme(legend.position = "none")

print(plot)

Questions:

  1. Which movie from which Genre had the longest runtime?
# Order the movies longest-first and keep only the top row.
longest_runtime_movie <- movies_data %>%
  arrange(desc(Runtime)) %>%
  head(n = 1)
longest_runtime_movie
##   Rank   Title Description         Genre Runtime Rating Votes
## 1    3 Silence  (I) (2016) Drama,History     161    7.2 32510
  2. In the Runtime of 130-160 minutes, which genre has the highest votes?
# Keep movies whose runtime is within 130-160 minutes (both ends inclusive),
# total the votes for each distinct Genre value, and return the largest total.
# Grouping uses the Genre column as stored, i.e. the full comma-separated
# genre combination of each movie.
highest_votes_genre <- movies_data %>%
  filter(between(Runtime, 130, 160)) %>%
  group_by(Genre) %>%
  summarise(TotalVotes = sum(Votes)) %>%
  arrange(desc(TotalVotes)) %>%
  head(n = 1)
highest_votes_genre
## # A tibble: 1 × 2
##   Genre                   TotalVotes
##   <chr>                        <dbl>
## 1 Action,Adventure,Sci-Fi     486343
  3. Across all genres, which genre has the highest average gross earnings in runtime 100 to 120?
sum(is.na(movies_data[["Runtime"]]))  # Number of missing Runtime values
## [1] 0
str(object = movies_data)  # Column names, types and a preview of the values
## 'data.frame':    100 obs. of  7 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title      : chr  "Terrifier" "Suicide Squad" "Silence" "Hush" ...
##  $ Description: chr  "(2016)" "(2016)" "(I) (2016)" "(I) (2016)" ...
##  $ Genre      : chr  "Horror,Thriller" "Action,Adventure,Fantasy" "Drama,History" "Horror,Thriller" ...
##  $ Runtime    : num  85 123 161 82 134 117 139 145 128 108 ...
##  $ Rating     : num  5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
##  $ Votes      : num  47833 710294 32510 119507 710 ...
head(movies_data, n = 6L)  # Show the first six rows of the data frame
##   Rank           Title Description                    Genre Runtime Rating
## 1    1       Terrifier      (2016)          Horror,Thriller      85    5.6
## 2    2   Suicide Squad      (2016) Action,Adventure,Fantasy     123    5.9
## 3    3         Silence  (I) (2016)            Drama,History     161    7.2
## 4    4            Hush  (I) (2016)          Horror,Thriller      82    6.6
## 5    5 The Conjuring 2      (2016)  Horror,Mystery,Thriller     134    7.3
## 6    6           Split (IX) (2016)          Horror,Thriller     117    7.3
##    Votes
## 1  47833
## 2 710294
## 3  32510
## 4 119507
## 5    710
## 6 149333