library(robotstxt)
# `url` has to be assigned before robots.txt is consulted: in a fresh session
# the bare name `url` resolves to base R's url() connection function rather
# than the page address, so the permission check would not see the page.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
is_allowed <- paths_allowed(url)
www.imdb.com
# Report the robots.txt verdict held in `is_allowed` as a single printed line.
print(
  if (is_allowed) {
    "Scraping is allowed for this page."
  } else {
    "Scraping is NOT allowed for this page."
  }
)
[1] "Scraping is allowed for this page."
library(rvest)
# Download the IMDb full-credits page and collect every <table> node on it.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
tables <- html_nodes(webpage, "table")
# Extract the third table with `[[`: `tables[3]` is still a node set, so
# html_table() would return a list of tables and dim() on a list is NULL.
series_cast_table <- html_table(tables[[3]], fill = TRUE)
dim(series_cast_table)
NULL
library(rvest)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Fetch the IMDb full-credits page and gather every <table> node on it.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
tables <- html_nodes(webpage, "table")

# The cast listing is read from the third table; fail with a clear message if
# the page layout changed instead of a bare "subscript out of bounds" error.
if (length(tables) < 3L) {
  stop(
    "Expected at least 3 tables on the page, found ", length(tables), ".",
    call. = FALSE
  )
}
cast_table <- html_table(tables[[3]], fill = TRUE)

# Keep columns 2 and 4 (auto-named X2 and X4) and drop every row in which
# either one is missing or empty. One filter() is enough: a second pass with
# the same non-empty conditions cannot remove any further rows.
clean_cast <- cast_table %>%
  select(2, 4) %>%
  filter(!is.na(X2), X2 != "", !is.na(X4), X4 != "")
Final:
# Summarise the shape of the cleaned cast table in one sentence.
num_rows <- dim(clean_cast)[1L]
num_columns <- dim(clean_cast)[2L]
cat(
  "Final cleaned dataset contains:", num_rows,
  "observations and", num_columns, "columns.\n",
  sep = " "
)
Final cleaned dataset contains: 1575 observations and 2 columns.
Clean Display:
# Preview the first six rows of the cleaned cast table.
head(clean_cast, n = 6L)
LS0tCnRpdGxlOiAiQXNzaWdubWVudCAzIE93ZW4gSGVuZGVyc29uICIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpgYGB7cn0KbGlicmFyeShyb2JvdHN0eHQpCmBgYAoKYGBge3J9CmlzX2FsbG93ZWQgPC0gcGF0aHNfYWxsb3dlZCh1cmwpCmBgYApgYGB7cn0KaWYgKGlzX2FsbG93ZWQpIHsKICBwcmludCgiU2NyYXBpbmcgaXMgYWxsb3dlZCBmb3IgdGhpcyBwYWdlLiIpCn0gZWxzZSB7CiAgcHJpbnQoIlNjcmFwaW5nIGlzIE5PVCBhbGxvd2VkIGZvciB0aGlzIHBhZ2UuIikKfQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHJ2ZXN0KQp1cmwgPC0gImh0dHBzOi8vd3d3LmltZGIuY29tL3RpdGxlL3R0NzIzNTQ2Ni9mdWxsY3JlZGl0cz9yZWZfPXR0X2NsX3NtIgp3ZWJwYWdlIDwtIHJlYWRfaHRtbCh1cmwpCmBgYAoKYGBge3J9CnRhYmxlcyA8LSBodG1sX25vZGVzKHdlYnBhZ2UsICJ0YWJsZSIpCmBgYAoKYGBge3J9CnNlcmllc19jYXN0X3RhYmxlIDwtIGh0bWxfdGFibGUodGFibGVzWzNdLCBmaWxsID0gVFJVRSkKYGBgCgpgYGB7cn0KZGltKHNlcmllc19jYXN0X3RhYmxlKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHJ2ZXN0KQpsaWJyYXJ5KGRwbHlyKQpgYGAKYGBge3J9CnVybCA8LSAiaHR0cHM6Ly93d3cuaW1kYi5jb20vdGl0bGUvdHQ3MjM1NDY2L2Z1bGxjcmVkaXRzP3JlZl89dHRfY2xfc20iCmBgYAoKYGBge3J9CndlYnBhZ2UgPC0gcmVhZF9odG1sKHVybCkKYGBgCgpgYGB7cn0KdGFibGVzIDwtIGh0bWxfbm9kZXMod2VicGFnZSwgInRhYmxlIikKYGBgCgpgYGB7cn0KY2FzdF90YWJsZSA8LSBodG1sX3RhYmxlKHRhYmxlc1tbM11dLCBmaWxsID0gVFJVRSkKYGBgCgpgYGB7cn0KY2xlYW5fY2FzdCA8LSBjYXN0X3RhYmxlICU+JQogIHNlbGVjdCgyLCA0KSAlPiUgIAogIGZpbHRlcighaXMubmEoWDIpICYgWDIgIT0gIiIsICFpcy5uYShYNCkgJiBYNCAhPSAiIikgIAoKYGBgCgpgYGB7cn0KY2xlYW5fY2FzdCA8LSBjbGVhbl9jYXN0ICU+JSBmaWx0ZXIoWDIgIT0gIiIgJiBYNCAhPSAiIikKYGBgCgpGaW5hbDoKYGBge3J9Cm51bV9yb3dzIDwtIG5yb3coY2xlYW5fY2FzdCkKbnVtX2NvbHVtbnMgPC0gbmNvbChjbGVhbl9jYXN0KQpjYXQoIkZpbmFsIGNsZWFuZWQgZGF0YXNldCBjb250YWluczoiLCBudW1fcm93cywgIm9ic2VydmF0aW9ucyBhbmQiLCBudW1fY29sdW1ucywgImNvbHVtbnMuXG4iKQoKYGBgCgpDbGVhbiBEaXNwbGF5OgpgYGB7cn0KaGVhZChjbGVhbl9jYXN0KQpgYGAKCg==