library(robotstxt)
## Warning: package 'robotstxt' was built under R version 4.4.2
library(rvest)
# 5) Is scraping this web page allowed?
# Check the site's robots.txt to see whether the IMDb full-credits page for
# this title may be fetched by a bot; the logical result is auto-printed.
paths_allowed(
  paths = "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
)
## www.imdb.com
## [1] TRUE
# 6) How many rows and columns are in the table?
# Download and parse the full-credits page, then show a short summary of the
# parsed document.
bas_html <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm" %>%
  read_html()
bas_html
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
# Gather every <table> node in the parsed page and display the third one
# (the printed node carries class "cast_list").
table_html <- bas_html %>%
  html_elements("table")
table_html[3]
## {xml_nodeset (1)}
## [1] <table class="cast_list">\n<tr><td colspan="4" class="castlist_label"></t ...
# Convert the third table node into a tibble. Because a one-element nodeset
# (not a single node) is passed, the result is a list holding one tibble.
tibble_list <- table_html[3] %>%
  html_table()
tibble_list
## [[1]]
## # A tibble: 3,150 × 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Angela Bassett" "..." "Athena Grant\n / ... \n …
## 3 NA "" "" ""
## 4 NA "Peter Krause" "..." "Bobby Nash\n 115 episodes, 20…
## 5 NA "" "" ""
## 6 NA "Oliver Stark" "..." "Evan 'Buck' Buckley\n 115 epi…
## 7 NA "" "" ""
## 8 NA "Aisha Hinds" "..." "Henrietta 'Hen' Wilson\n 115 …
## 9 NA "" "" ""
## 10 NA "Kenneth Choi" "..." "Howie 'Chimney' Han\n 115 epi…
## # ℹ 3,140 more rows
# Pull the single tibble out of its list wrapper so it can be indexed
# directly, then display it.
eastern_tibble <- tibble_list[[1L]]
eastern_tibble
# 7) Clean data and find exact number of people in cast.
# Keep only columns 2 and 4 of the scraped cast table. In the printed preview
# these hold the performer name and the character/episode text; column 1 is
# NA and column 3 is the "..." separator.
cast_df <- eastern_tibble[, c(2, 4)]

# Drop spacer rows: a row is kept only when BOTH remaining cells are present
# and non-empty. NA cells are treated as empty, which matches how subset()
# discards rows whose condition evaluates to NA. A single explicit mask
# replaces the earlier subset() + apply() pair: once every kept row has two
# non-empty cells, a second "not all cells empty" pass can never remove
# anything, and apply() would needlessly coerce the data frame to a matrix.
has_name <- !is.na(cast_df[[1]]) & cast_df[[1]] != ""
has_role <- !is.na(cast_df[[2]]) & cast_df[[2]] != ""
cast_df <- cast_df[has_name & has_role, ]

# Rows x columns of the cleaned table; the row count is the number of
# cast entries.
dim(cast_df)
## [1] 1574 2
# 8) Series Visual Effects staff names.
library(rvest)
# Count the staff listed under the "Series Visual Effects" heading of the
# full-credits page.
#
# Steps:
#   1. Parse the page and find every element whose own text contains the
#      heading string.
#   2. Take the first <table> that follows the first match in document order.
#   3. Convert it to a tibble, drop rows that carry no text at all, and print
#      the number of remaining rows.
page <- read_html("https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm")

effects_section <- page %>%
  html_elements(xpath = "//*[contains(text(), 'Series Visual Effects')]")

if (length(effects_section) > 0) {
  # Selects the next table after the heading.
  effects_table <- effects_section[[1]] %>%
    html_element(xpath = "./following::table[1]")

  # html_element() on a single node never returns NULL: when nothing matches
  # it yields an `xml_missing` object, so that is what has to be tested for
  # the "not found" branch to be reachable.
  if (!inherits(effects_table, "xml_missing")) {
    # `fill` is deprecated in html_table(); short rows are always padded.
    visual_effects_df <- html_table(effects_table)

    # A row counts as a staff entry when at least one cell holds text.
    # NA must be tested explicitly because nzchar(NA) is TRUE, which would
    # make an all-NA row look non-empty.
    cell_text <- as.matrix(visual_effects_df)
    cell_has_text <- !is.na(cell_text) & cell_text != ""
    visual_effects_df <- visual_effects_df[rowSums(cell_has_text) > 0, ]

    num_staff <- nrow(visual_effects_df)
    print(num_staff)
  } else {
    print("Table not found. Check the webpage structure.")
  }
} else {
  print("Section 'Series Visual Effects' not found. Verify XPath.")
}
## [1] 196