Week 9 Homework

Introduction

We use web scraping to collect data from IMDb on the 100 most popular feature films released in 2016 and then graph the results.

Load libraries

library(rvest)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Web Scraping

# IMDb advanced-search URL: feature films released in 2016, 100 per page.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the results page once; every selector below reads from it.
webpage <- url %>% read_html()

Scrape the data

# Extract every field per listing so that position i in each vector refers to
# the same movie. Running html_nodes() over the whole page skips movies that
# lack a field and returns several matches for others, which shifts values
# onto the wrong movie. html_node() applied to the listing nodes returns
# exactly one value per listing, and NA where the field is absent.
# NOTE(review): assumes each movie is wrapped in a '.lister-item' node --
# confirm against the page markup.
movie_nodes <- webpage %>% html_nodes('.lister-item')

rank_data <- movie_nodes %>%
  html_node('.text-primary') %>%
  html_text()

title_data <- movie_nodes %>%
  html_node('.lister-item-header a') %>%
  html_text()

# The earlier selector, '.text-muted:nth-child(3)', captured the year label
# (e.g. "(2016)") rather than the plot summary.
# NOTE(review): assumes the summary paragraph directly follows the ratings
# bar -- confirm against the page markup.
description_data <- movie_nodes %>%
  html_node('.ratings-bar+ .text-muted') %>%
  html_text()

genre_data <- movie_nodes %>%
  html_node('.genre') %>%
  html_text() %>%
  gsub('\\s+', '', .)

# The numeric fields stay as text here; conversion to numbers happens in the
# cleaning step. The former trailing `%>% suppressWarnings(as.numeric(.))`
# was dropped because it never converted anything: the pipe passes its input
# as the first argument, so suppressWarnings() returned the text unchanged.
runtime_data <- movie_nodes %>%
  html_node('.runtime') %>%
  html_text() %>%
  gsub(' min', '', .)

rating_data <- movie_nodes %>%
  html_node('.inline-block strong') %>%
  html_text()

# "span[name='nv']" matches both the vote count and the gross figure, so
# collecting every match interleaves the two. Only the first match inside
# each listing is kept.
# NOTE(review): assumes the vote count precedes the gross figure within a
# listing -- confirm against the page markup.
votes_data <- movie_nodes %>%
  html_node("span[name='nv']") %>%
  html_text() %>%
  gsub(',', '', .)

Cleaning the data and handling missing values

# Clean the numeric vectors: strip every character that cannot be part of the
# number, then convert. A value with no digits becomes NA and stays in place,
# so each position keeps referring to the same movie.
runtime_data <- as.numeric(gsub("[^0-9]", "", runtime_data))
rating_data <- as.numeric(gsub("[^0-9.]", "", rating_data))
votes_data <- as.numeric(gsub("[^0-9]", "", votes_data))

# NA values are deliberately NOT removed from the individual vectors here:
# dropping an element from one vector shifts every later value onto a
# different movie. Incomplete rows are removed with na.omit() after the data
# frame has been built.

# Clean the character vectors
title_data <- trimws(title_data)
description_data <- trimws(description_data)
genre_data <- trimws(genre_data)

# Make sure all vectors are of the same length
field_lengths <- c(
  rank = length(rank_data),
  title = length(title_data),
  description = length(description_data),
  genre = length(genre_data),
  runtime = length(runtime_data),
  rating = length(rating_data),
  votes = length(votes_data)
)
min_length <- min(field_lengths)

# Differing lengths mean the fields were not scraped one-per-movie; truncation
# makes the data frame constructible but cannot restore the alignment, so the
# problem is reported instead of being hidden.
if (any(field_lengths != min_length)) {
  warning(
    "Scraped fields have different lengths (",
    paste(names(field_lengths), field_lengths, sep = " = ", collapse = ", "),
    "); truncating to ", min_length, ". Rows may be misaligned.",
    call. = FALSE
  )
}

# Truncate all vectors to the minimum length. seq_len() yields an empty index
# when min_length is 0, whereas 1:min_length would yield c(1, 0).
keep <- seq_len(min_length)
rank_data <- rank_data[keep]
title_data <- title_data[keep]
description_data <- description_data[keep]
genre_data <- genre_data[keep]
runtime_data <- runtime_data[keep]
rating_data <- rating_data[keep]
votes_data <- votes_data[keep]


# data frame
# Combine the cleaned vectors into one table (one row per movie) and keep only
# the rows in which every column has a value.
movies_data <- na.omit(
  data.frame(
    Rank = as.integer(rank_data),
    Title = as.character(title_data),
    Description = as.character(description_data),
    Genre = as.character(genre_data),
    Runtime = runtime_data,
    Rating = rating_data,
    Votes = votes_data
  )
)

Building the graphs

First graph from tutorial: library('ggplot2')

qplot(data = movies_df,Runtime,fill = Genre,bins = 30)

# Make sure Runtime is numeric before it is binned.
movies_data$Runtime <- as.numeric(as.character(movies_data$Runtime))

# Genre holds comma-separated labels; count each individual label and keep the
# names of the ten most frequent ones.
genre_counts <- sort(
  table(unlist(strsplit(movies_data$Genre, ","))),
  decreasing = TRUE
)
common_genres <- names(genre_counts[1:10])

# One row per (movie, genre) pair, restricted to the most frequent genres.
movies_data_long <- movies_data %>%
  tidyr::separate_rows(Genre, sep = ",") %>%
  dplyr::filter(Genre %in% common_genres)

# Overlaid, semi-transparent runtime histograms, one colour per genre.
ggplot(data = movies_data_long, mapping = aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    title = "Distribution of Runtime by Genre for 100 Most Popular Movies of 2016",
    x = "Runtime",
    y = "Count",
    fill = "Genre"
  )

Second Graph From Tutorial: ggplot(movies_df,aes(x=Runtime,y=Rating))+ geom_point(aes(size=Votes,col=Genre))

# Scatter plot of rating against runtime. Point size encodes the number of
# votes and point colour encodes the movie's genre combination.
ggplot(movies_data, aes(x = Runtime, y = Rating)) +
  geom_point(aes(size = Votes, col = Genre), alpha = 0.6) +
  scale_size_continuous(range = c(1, 10)) +
  theme_minimal() +
  labs(
    title = "Rating vs. Runtime of Movies",
    x = "Runtime (in minutes)",
    y = "Rating (out of 10)",
    subtitle = "Size of points represent number of votes",
    caption = "Source: IMDb"
  ) +
  # Hide the legend. The documented value is lowercase "none"; "None" is not
  # a valid legend position.
  theme(legend.position = "none")

Third Graph From Tutorial: ggplot(movies_df,aes(x=Runtime,y=Gross_Earning_in_Mil))+ geom_point(aes(size=Rating,col=Genre))

# Scatter plot of vote count against runtime. Point size encodes the rating
# and point colour encodes the movie's genre combination.
# The y axis shows Votes (no gross-earnings column exists in movies_data), so
# the title and axis label name votes rather than gross earnings.
plot <- ggplot(movies_data, aes(x = Runtime, y = Votes)) +
  geom_point(aes(size = Rating, color = Genre), alpha = 0.7) +
  scale_size_continuous(range = c(2, 10)) +
  labs(
    title = "Runtime vs. Votes of Movies",
    x = "Runtime",
    y = "Votes",
    subtitle = "Size of points represent movie ratings while colors represent different genres",
    caption = "Data Source: IMDb"
  ) +
  theme_minimal() +
  # Hide the legend. The documented value is lowercase "none"; "None" is not
  # a valid legend position.
  theme(legend.position = "none")

print(plot)

Questions:

Which movie from which Genre had the longest runtime?

# Order the movies from longest to shortest runtime and keep the first row.
movies_by_runtime <- arrange(movies_data, desc(Runtime))
longest_runtime_movie <- head(movies_by_runtime, n = 1)
longest_runtime_movie

##   Rank   Title Description         Genre Runtime Rating Votes
## 1    3 Silence  (I) (2016) Drama,History     161    7.2 32510

Among movies with a runtime of 130-160 minutes, which genre has the most votes?

# Movies whose runtime lies in the closed interval [130, 160] minutes.
mid_length_movies <- filter(movies_data, between(Runtime, 130, 160))

# Total votes per Genre value (the full comma-separated genre combination).
votes_by_genre <- summarise(
  group_by(mid_length_movies, Genre),
  TotalVotes = sum(Votes)
)

# Keep the genre combination with the largest vote total.
highest_votes_genre <- head(arrange(votes_by_genre, desc(TotalVotes)), n = 1)
highest_votes_genre

## # A tibble: 1 × 2
##   Genre                   TotalVotes
##   <chr>                        <dbl>
## 1 Action,Adventure,Sci-Fi     486343

Across all genres, which genre has the highest average gross earnings for runtimes of 100 to 120 minutes?

# Number of movies whose Runtime is missing.
sum(is.na(movies_data[["Runtime"]]))

## [1] 0

# Show each column's type together with its first few values.
str(object = movies_data)

## 'data.frame':    100 obs. of  7 variables:
##  $ Rank       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title      : chr  "Terrifier" "Suicide Squad" "Silence" "Hush" ...
##  $ Description: chr  "(2016)" "(2016)" "(I) (2016)" "(I) (2016)" ...
##  $ Genre      : chr  "Horror,Thriller" "Action,Adventure,Fantasy" "Drama,History" "Horror,Thriller" ...
##  $ Runtime    : num  85 123 161 82 134 117 139 145 128 108 ...
##  $ Rating     : num  5.6 5.9 7.2 6.6 7.3 7.3 8.1 8.1 8 7.1 ...
##  $ Votes      : num  47833 710294 32510 119507 710 ...

# Preview the first six rows of the table.
head(movies_data, n = 6L)

##   Rank           Title Description                    Genre Runtime Rating
## 1    1       Terrifier      (2016)          Horror,Thriller      85    5.6
## 2    2   Suicide Squad      (2016) Action,Adventure,Fantasy     123    5.9
## 3    3         Silence  (I) (2016)            Drama,History     161    7.2
## 4    4            Hush  (I) (2016)          Horror,Thriller      82    6.6
## 5    5 The Conjuring 2      (2016)  Horror,Mystery,Thriller     134    7.3
## 6    6           Split (IX) (2016)          Horror,Thriller     117    7.3
##    Votes
## 1  47833
## 2 710294
## 3  32510
## 4 119507
## 5    710
## 6 149333

Week 9 Homework

xutong

2023-10-31