library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Pull the first <table> from the Basketball Reference box-scores page for
# 14 Feb 2025 and print it.
url <- "https://www.basketball-reference.com/boxscores/?month=02&day=14&year=2025"
webpage <- url %>% read_html()
table_node <- webpage %>% html_element("table")
bball_table <- table_node %>% html_table(fill = TRUE)
print(bball_table)
## # A tibble: 15 × 7
## `Eastern Conference` W L `W/L%` GB `PS/G` `PA/G`
## <chr> <int> <int> <dbl> <chr> <dbl> <dbl>
## 1 Cleveland Cavaliers 44 10 0.815 — 123. 112.
## 2 Boston Celtics 39 16 0.709 5.5 117. 108.
## 3 New York Knicks 36 18 0.667 8.0 118. 112.
## 4 Indiana Pacers 30 23 0.566 13.5 116 116.
## 5 Milwaukee Bucks 29 24 0.547 14.5 114. 113.
## 6 Detroit Pistons 29 26 0.527 15.5 114. 113.
## 7 Orlando Magic 27 29 0.482 18.0 104 105.
## 8 Atlanta Hawks 26 29 0.473 18.5 117. 119.
## 9 Miami Heat 25 28 0.472 18.5 110. 111.
## 10 Chicago Bulls 22 33 0.4 22.5 116 121.
## 11 Brooklyn Nets 20 34 0.37 24.0 105 111
## 12 Philadelphia 76ers 20 34 0.37 24.0 109. 113.
## 13 Toronto Raptors 17 38 0.309 27.5 111. 116.
## 14 Charlotte Hornets 13 39 0.25 30.0 106. 112.
## 15 Washington Wizards 9 45 0.167 35.0 109. 122
# Install robotstxt only when it is missing, so re-running the script does not
# download the package from CRAN again on every execution.
if (!requireNamespace("robotstxt", quietly = TRUE)) {
  install.packages("robotstxt", repos = "https://cran.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/x5/86kchw4505n33hzsz9x480jh0000gn/T//RtmpAWkfSg/downloaded_packages
library(robotstxt)
## Warning: package 'robotstxt' was built under R version 4.3.3
# Ask robotstxt whether the IMDb full-credits page may be fetched before
# scraping it.
# NOTE(review): the robotstxt examples pass a site-relative path together with
# a bare host name; confirm that a full URL plus a scheme-prefixed domain is
# matched against the rules as intended.
domain <- "https://www.imdb.com/"
url_to_check <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
is_allowed <- paths_allowed(domain = domain, paths = url_to_check)
##
## https://www.imdb.com/
# Print a one-line verdict for the robots.txt check stored in `is_allowed`.
print(
  if (is_allowed) {
    "Scraping is allowed: TRUE"
  } else {
    "Scraping is disallowed: FALSE"
  }
)
## [1] "Scraping is allowed: TRUE"
# Download the IMDb full-credits page, collect every <table> on it, and parse
# the third one into a tibble; then show its structure.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
tables <- webpage %>% html_nodes("table")
cast_table <- tables[[3]] %>% html_table(fill = TRUE)
str(cast_table)
## tibble [3,152 × 4] (S3: tbl_df/tbl/data.frame)
## $ X1: logi [1:3152] NA NA NA NA NA NA ...
## $ X2: chr [1:3152] "" "Angela Bassett" "" "Peter Krause" ...
## $ X3: chr [1:3152] "" "..." "" "..." ...
## $ X4: chr [1:3152] "" "Athena Grant\n / ... \n 115 episodes, 2018-2025" "" "Bobby Nash\n 115 episodes, 2018-2025" ...
# Preview the first six rows of the raw table.
head(cast_table, n = 6L)
## # A tibble: 6 × 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Angela Bassett" "..." "Athena Grant\n / ... \n …
## 3 NA "" "" ""
## 4 NA "Peter Krause" "..." "Bobby Nash\n 115 episodes, 201…
## 5 NA "" "" ""
## 6 NA "Oliver Stark" "..." "Evan 'Buck' Buckley\n 115 epis…
# Report the dimensions of the raw cast table.
rows <- dim(cast_table)[1]
cols <- dim(cast_table)[2]
cat("Number of Rows:", rows, "\n")
## Number of Rows: 3152
cat("Number of Columns:", cols, "\n")
## Number of Columns: 4
# The credits page was already downloaded and its third table parsed into
# `cast_table` earlier in this script (`url`, `webpage` and `tables` still hold
# the same values), so reuse that object instead of requesting the identical
# URL from IMDb a second time.
str(cast_table)
## tibble [3,152 × 4] (S3: tbl_df/tbl/data.frame)
## $ X1: logi [1:3152] NA NA NA NA NA NA ...
## $ X2: chr [1:3152] "" "Angela Bassett" "" "Peter Krause" ...
## $ X3: chr [1:3152] "" "..." "" "..." ...
## $ X4: chr [1:3152] "" "Athena Grant\n / ... \n 115 episodes, 2018-2025" "" "Bobby Nash\n 115 episodes, 2018-2025" ...
# Keep the name column (2) and the role/episode column (4), dropping the
# spacer rows in which either of them is an empty string. `subset()` also
# discards rows whose condition is NA.
# A single filter is enough: the former second pass re-applied the same test
# through `apply()`, which coerces the data frame to a character matrix and
# could not remove anything the first pass had kept.
cleaned_cast <- subset(
  cast_table[, c(2, 4)],
  cast_table[[2]] != "" & cast_table[[4]] != ""
)
head(cleaned_cast)
## # A tibble: 6 × 2
## X2 X4
## <chr> <chr>
## 1 Angela Bassett "Athena Grant\n / ... \n 115 e…
## 2 Peter Krause "Bobby Nash\n 115 episodes, 2018-2025"
## 3 Oliver Stark "Evan 'Buck' Buckley\n 115 episodes, 20…
## 4 Aisha Hinds "Henrietta 'Hen' Wilson\n 115 episodes,…
## 5 Kenneth Choi "Howie 'Chimney' Han\n 115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n / ... \n 105…
# Preview the last six rows of the cleaned table.
tail(cleaned_cast, n = 6L)
## # A tibble: 6 × 2
## X2 X4
## <chr> <chr>
## 1 Aly Fabrizio "Trick or Treater\n \n \n (uncredited)\n \n …
## 2 Buffy Milner "Volleyball Player\n \n \n (uncredited)\n \n …
## 3 Ithaka Darin Pappas "Migrant\n \n \n (uncredited)\n \n 1…
## 4 Bryce Schmidt "Police Bugler\n \n \n (uncredited)\n \n …
## 5 Timothy T Tyler "Patient\n \n \n (uncredited)\n \n 1…
## 6 Jeffrey Viner "Car\n / ... \n \n \n (uncredited)\n \n …
# Report the dimensions of the cleaned cast table.
rows <- dim(cleaned_cast)[1]
cols <- dim(cleaned_cast)[2]
cat("Final Number of Rows:", rows, "\n")
## Final Number of Rows: 1575
cat("Final Number of Columns:", cols, "\n")
## Final Number of Columns: 2
# Rename the two columns of the cleaned table.
# The raw `cast_table` has four columns, so assigning it only two names made
# tibble warn about a length mismatch and empty names; the two-column table is
# `cleaned_cast`. One assignment is sufficient (`names<-` and `colnames<-` set
# the same attribute on a data frame).
names(cleaned_cast) <- c("v1", "v2")
library(rvest)
library(dplyr)
# Count the distinct, non-empty entries in the first column of the credits
# table whose nearest preceding <h4> sibling mentions "Series Visual Effects".
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
tables <- html_nodes(webpage, ".simpleTable.simpleCreditsTable")

# Heading text for every table at once; tables without a preceding <h4> give
# NA, which grepl() treats as a non-match.
headings <- tables %>%
  html_node(xpath = "preceding-sibling::h4[1]") %>%
  html_text()

# Index of the first matching table, or NA when there is none.
hit <- match(TRUE, grepl("Series Visual Effects", headings))
visual_effects_table <- if (is.na(hit)) {
  NULL
} else {
  html_table(tables[[hit]], fill = TRUE)
}

if (is.null(visual_effects_table)) {
  cat("Series Visual Effects table not found.\n")
} else {
  ve_staff <- visual_effects_table[[1]]
  ve_staff <- unique(ve_staff[!is.na(ve_staff) & ve_staff != ""])
  staff_count <- length(ve_staff)
  cat("Total Series Visual Effects Staff:", staff_count, "\n")
}
## Total Series Visual Effects Staff: 196