R Notebook

Charlie Stevens

library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tibble)

Question 6:

# Question 6: download the IMDb full-credits page and convert the third
# <table> element found on it into a data frame.
# NOTE(review): the positional index 3 is presumably the series-cast table;
# confirm against the live page, since the index breaks if the layout changes.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
page <- read_html(url)
tables <- html_elements(page, "table")
series_cast_table <- html_table(tables[[3]])

# Preview the first rows, then report the table's dimensions (rows, columns).
head(series_cast_table)

dim(series_cast_table)

## [1] 3152    4

Question 7:

# Question 7: keep only columns X2 and X4, drop every row in which either one
# is missing or an empty string, then remove duplicate rows.
# NOTE(review): X2/X4 presumably hold the actor and character text -- confirm.
cleaned_table <- series_cast_table %>%
  select(X2, X4) %>%
  filter(
    !is.na(X2),
    X2 != "",
    !is.na(X4),
    X4 != ""
  ) %>%
  distinct()

# Dimensions (rows, columns) of the cleaned table.
dim(cleaned_table)

## [1] 1575    2

Question 8:

# Question 8: build a three-row demo tibble with two integer columns, A and B.
tibble_name <- tibble(
  A = c(1L, 2L, 3L),
  B = c(4L, 5L, 6L)
)
print(tibble_name)

## # A tibble: 3 × 2
##       A     B
##   <int> <int>
## 1     1     4
## 2     2     5
## 3     3     6

# Rename the tibble's two columns by assigning a character vector of new names
# (one per column, matched by position) through the `colnames<-` replacement
# form.
colnames(tibble_name) <- c("v1", "v2")
# Print again to show the renamed columns.
print(tibble_name)

## # A tibble: 3 × 2
##      v1    v2
##   <int> <int>
## 1     1     4
## 2     2     5
## 3     3     6

Answer: A and D

Question 9:

library(rvest)

# Question 9: address of the IMDb full-credits page.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

# Download and parse the page.
page <- read_html(url)

# Select the <tbody> of the <table> that is the 38th child of
# #fullcredits_content (a position among all children, not the 38th table).
# NOTE(review): presumably the visual-effects credits table -- the hard-coded
# position must be re-checked whenever the page layout changes.
vfx_table <- html_element(
  page,
  "#fullcredits_content > table:nth-child(38) > tbody"
)

# Text of the first cell in each row, with surrounding whitespace trimmed.
vfx_names <- html_text(
  html_elements(vfx_table, "tr td:nth-child(1)"),
  trim = TRUE
)

# Number of first-column cells that were extracted.
length(vfx_names)

## [1] 196