library(robotstxt)

# URL to check against the site's robots.txt.
# NOTE(review): the scraping code further down requests this same path with a
# "?ref_=tt_cl_sm" query string appended; only the bare path is checked here.
url <- "https://www.imdb.com/title/tt7235466/fullcredits"

# Query robots.txt permissions for the path (domain is derived from the URL).
is_allowed <- paths_allowed(url)
## www.imdb.com

# isTRUE() treats an NA or zero-length answer as "not allowed" instead of
# letting if() fail on a non-TRUE/FALSE condition.
if (isTRUE(is_allowed)) {
  print("Scraping is allowed for this page.")
} else {
  print("Scraping is NOT allowed for this page.")
}
## [1] "Scraping is allowed for this page."
library(rvest)

# Page to scrape.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)

# All <table> nodes found on the page.
tables <- html_nodes(webpage, "table")

# Fail with a clear message if the page layout does not contain a 3rd table.
if (length(tables) < 3L) {
  stop(
    "Expected at least 3 tables on the page, found ", length(tables),
    call. = FALSE
  )
}

# 3rd table. Use [[3]] to extract the single node: tables[3] is still a node
# set, for which html_table() returns a list, and dim() of a list is NULL.
series_cast_table <- html_table(tables[[3]], fill = TRUE)

# Rows and columns of the parsed table.
dim(series_cast_table)
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

# URL of the page to scrape.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

# HTML content of the page.
webpage <- read_html(url)

# All <table> nodes from the page.
tables <- html_nodes(webpage, "table")

# Fail with a clear message if the page layout does not contain a 3rd table.
if (length(tables) < 3L) {
  stop(
    "Expected at least 3 tables on the page, found ", length(tables),
    call. = FALSE
  )
}

# 3rd table, extracted as a single node so html_table() returns one table.
cast_table <- html_table(tables[[3]], fill = TRUE)

# Cleaned: keep columns 2 and 4 (named X2 and X4 in the parsed table) and drop
# rows where either one is missing or empty. A second, identical empty-string
# filter that used to follow this step was a no-op and has been removed.
clean_cast <- cast_table %>%
  select(2, 4) %>%
  filter(
    !is.na(X2), X2 != "",
    !is.na(X4), X4 != ""
  )

# Display the final dimensions.
num_rows <- nrow(clean_cast)
num_columns <- ncol(clean_cast)
cat(
  "Final cleaned dataset contains:", num_rows,
  "observations and", num_columns, "columns.\n"
)
## Final cleaned dataset contains: 1575 observations and 2 columns.

# Display the first rows of the cleaned data.
head(clean_cast)
## # A tibble: 6 × 2
## X2 X4
## <chr> <chr>
## 1 Angela Bassett "Athena Grant\n / ... \n 115 e…
## 2 Peter Krause "Bobby Nash\n 115 episodes, 2018-2025"
## 3 Oliver Stark "Evan 'Buck' Buckley\n 115 episodes, 20…
## 4 Aisha Hinds "Henrietta 'Hen' Wilson\n 115 episodes,…
## 5 Kenneth Choi "Howie 'Chimney' Han\n 115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n / ... \n 105…