# Install robotstxt only when it is not already available.
# Note: installed.packages() merely lists the packages found in a library
# directory (its first argument is a library path, not a package name);
# install.packages() is the function that actually installs one.
if (!requireNamespace("robotstxt", quietly = TRUE)) {
  install.packages("robotstxt")
}
library(robotstxt)
# Ask the site's robots.txt whether this path may be fetched.
url <- "https://www.imdb.com/title/tt7235466/fullcredits"
is_allowed <- paths_allowed(url)
##  www.imdb.com
# Report the verdict: pick the message first, then print it once.
print(
  if (is_allowed) {
    "Scraping is allowed for this page."
  } else {
    "Scraping is NOT allowed for this page."
  }
)
## [1] "Scraping is allowed for this page."
library(rvest)
# Download the full-credits page and collect every <table> element in it.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
tables <- html_elements(webpage, css = "table")
# Single-bracket subsetting keeps a node set, so html_table() returns a
# list holding one data frame rather than the data frame itself.
series_cast_table <- html_table(tables[3])
library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the full-credits page and extract the third table, which the output
# below shows to contain actor names (X2) and character/episode text (X4).
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
tables <- html_elements(webpage, "table")
# Fail with a clear message if the page layout changed and there is no
# third table, instead of a bare "subscript out of bounds" error.
if (length(tables) < 3) {
  stop(
    "Expected at least 3 tables on the page, found ", length(tables), ".",
    call. = FALSE
  )
}
cast_table <- html_table(tables[[3]])

# Keep columns 2 and 4 (named X2 and X4 by html_table()) and drop every row
# in which either of them is missing or empty. One filter() is enough: all
# conditions passed to it must hold for a row to be kept.
clean_cast <- cast_table %>%
  select(2, 4) %>%
  filter(!is.na(X2), X2 != "", !is.na(X4), X4 != "")

# Report the size of the cleaned data.
num_rows <- nrow(clean_cast)
num_columns <- ncol(clean_cast)
cat(
  "Final cleaned dataset contains:", num_rows,
  "observations and", num_columns, "columns.\n"
)
## Final cleaned dataset contains: 1577 observations and 2 columns.
head(clean_cast)
## # A tibble: 6 × 2
##   X2                   X4                                                       
##   <chr>                <chr>                                                    
## 1 Angela Bassett       "Athena Grant\n         / ...  \n                  115 e…
## 2 Peter Krause         "Bobby Nash\n                  115 episodes, 2018-2025"  
## 3 Oliver Stark         "Evan 'Buck' Buckley\n                  115 episodes, 20…
## 4 Aisha Hinds          "Henrietta 'Hen' Wilson\n                  115 episodes,…
## 5 Kenneth Choi         "Howie 'Chimney' Han\n                  115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n         / ...  \n                  105…
# Select one credits table by its position inside #fullcredits_content; the
# parsed output below shows visual-effects roles.
# NOTE(review): nth-child(38) is tied to the page's current layout and will
# point at a different element if sections are added or removed — confirm.
raw_visual <- html_element(
  webpage,
  "#fullcredits_content > table:nth-child(38)"
)
# html_element() yields a missing-node placeholder when nothing matches;
# stop here with a clear message instead of passing it on to html_table().
if (inherits(raw_visual, "xml_missing")) {
  stop("No table matched the visual effects selector.", call. = FALSE)
}
# Print the node itself; head() on a node only exposes internal pointers.
print(raw_visual)
# Convert the HTML table into a data frame and preview the first rows.
parse_visual <- html_table(raw_visual)
head(parse_visual)
## # A tibble: 6 × 3
##   X1                     X2    X3                                               
##   <chr>                  <chr> <chr>                                            
## 1 Christian Zeiler       ...   digital compositor / digital compositor: FuseFX …
## 2 Katrina Duclos         ...   visual effects editor / visual effects editor: F…
## 3 Bryant Reif            ...   cg supervisor (50 episodes, 2019-2022)           
## 4 Tony Pirzadeh          ...   visual effects producer: FuseFX / visual effects…
## 5 Ezra Christian         ...   managing producer (46 episodes, 2021-2024)       
## 6 Timothy Michael Cairns ...   compositing supervisor (44 episodes, 2019-2022)