# install.packages("robotstxt", repos = "https://cran.r-project.org")
# Assignment 3 Question 4 ----
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Download the box-scores page for 14 Feb 2025 and parse its first <table>
# element into a tibble.
url <- "https://www.basketball-reference.com/boxscores/?month=02&day=14&year=2025"
webpage <- url %>%
  read_html()
# html_element() (singular) returns only the first node matching the selector.
table_node <- webpage %>%
  html_element("table")
bball_table <- table_node %>%
  html_table(fill = TRUE)
print(bball_table)
## # A tibble: 15 × 7
## `Eastern Conference` W L `W/L%` GB `PS/G` `PA/G`
## <chr> <int> <int> <dbl> <chr> <dbl> <dbl>
## 1 Cleveland Cavaliers 44 10 0.815 — 123. 112.
## 2 Boston Celtics 39 16 0.709 5.5 117. 108.
## 3 New York Knicks 36 18 0.667 8.0 118. 112.
## 4 Indiana Pacers 30 23 0.566 13.5 116 116.
## 5 Milwaukee Bucks 29 24 0.547 14.5 114. 113.
## 6 Detroit Pistons 29 26 0.527 15.5 114. 113.
## 7 Orlando Magic 27 29 0.482 18.0 104 105.
## 8 Atlanta Hawks 26 29 0.473 18.5 117. 119.
## 9 Miami Heat 25 28 0.472 18.5 110. 111.
## 10 Chicago Bulls 22 33 0.4 22.5 116 121.
## 11 Brooklyn Nets 20 34 0.37 24.0 105 111
## 12 Philadelphia 76ers 20 34 0.37 24.0 109. 113.
## 13 Toronto Raptors 17 38 0.309 27.5 111. 116.
## 14 Charlotte Hornets 13 39 0.25 30.0 106. 112.
## 15 Washington Wizards 9 45 0.167 35.0 109. 122
# Assignment 3 Question 5 ----
# install.packages("robotstxt")
library(robotstxt)
# Ask the site's robots.txt whether the full-credits page may be scraped.
url_to_check <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
# paths_allowed() documents `domain` as a host and `paths` as paths on that
# host, so pass a bare host name and strip the scheme and host from the URL
# instead of handing over the full URL alongside a scheme-prefixed domain.
domain <- "www.imdb.com"
path_to_check <- sub("^https?://[^/]+", "", url_to_check)
is_allowed <- paths_allowed(paths = path_to_check, domain = domain)
## https://www.imdb.com/
# Print the robots.txt verdict held in is_allowed.
print(
  if (is_allowed) {
    "Scraping is allowed: TRUE"
  } else {
    "Scraping is disallowed: FALSE"
  }
)
## [1] "Scraping is allowed: TRUE"
# Question 6 ----
library(rvest)
library(dplyr)
# Fetch the full-credits page, collect every <table> on it, and convert the
# third one to a tibble.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(x = url)
tables <- html_elements(x = webpage, css = "table")
# NOTE(review): index 3 is presumably the series cast table, judging by the
# variable name; it depends on the page layout at scrape time.
series_cast_table <- html_table(x = tables[[3]], fill = TRUE)
# Report rows x columns of the raw table.
dim(series_cast_table)
## [1] 3152 4
# Question 7 ----
library(rvest)
library(dplyr)
# Download the full-credits page again and rebuild the raw table from the
# third <table> element on the page.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- url %>%
  read_html()
tables <- webpage %>%
  html_elements("table")
series_cast_table <- tables[[3]] %>%
  html_table(fill = TRUE)
# Preview the first rows before cleaning.
series_cast_table %>%
  head()
## # A tibble: 6 × 4
## X1 X2 X3 X4
## <lgl> <chr> <chr> <chr>
## 1 NA "" "" ""
## 2 NA "Angela Bassett" "..." "Athena Grant\n / ... \n …
## 3 NA "" "" ""
## 4 NA "Peter Krause" "..." "Bobby Nash\n 115 episodes, 201…
## 5 NA "" "" ""
## 6 NA "Oliver Stark" "..." "Evan 'Buck' Buckley\n 115 epis…
# Keep only columns 2 and 4 of the raw table, then drop the rows in which
# either of them is blank.
series_cast_cleaned <- series_cast_table[, c(2, 4)]
# Use [[ ]] so each comparison runs on a plain character vector; single-bracket
# indexing of a tibble returns a one-column tibble, which makes `!=` produce a
# logical matrix instead of a row mask.
first_col <- series_cast_cleaned[[1]]
second_col <- series_cast_cleaned[[2]]
# Treat NA like a blank so it can never become an NA row index (subset()
# likewise drops rows whose condition is NA).
has_first <- !is.na(first_col) & first_col != ""
has_second <- !is.na(second_col) & second_col != ""
series_cast_cleaned <- series_cast_cleaned[has_first & has_second, ]
tail(series_cast_cleaned)
## # A tibble: 6 × 2
## X2 X4
## <chr> <chr>
## 1 Aly Fabrizio "Trick or Treater\n \n \n (uncredited)\n \n …
## 2 Buffy Milner "Volleyball Player\n \n \n (uncredited)\n \n …
## 3 Ithaka Darin Pappas "Migrant\n \n \n (uncredited)\n \n 1…
## 4 Bryce Schmidt "Police Bugler\n \n \n (uncredited)\n \n …
## 5 Timothy T Tyler "Patient\n \n \n (uncredited)\n \n 1…
## 6 Jeffrey Viner "Car\n / ... \n \n \n (uncredited)\n \n …
# Count the empty-string cells in each row and keep only the rows that have
# none, then report the remaining rows x columns.
blank_cells_per_row <- rowSums(series_cast_cleaned == "")
series_cast_cleaned <- series_cast_cleaned[blank_cells_per_row == 0, ]
dim(series_cast_cleaned)
## [1] 1575 2
# Question 8 ----
# Rename only the first two columns to v1 and v2.
# bball_table has more than two columns, so assigning a length-2 vector to the
# whole of names() / colnames() triggers tibble's "must have the same length"
# and "can't be empty" warnings and leaves the remaining columns without
# names. Indexing the names being replaced keeps every other column intact.
# (colnames(bball_table)[1:2] <- ... is the equivalent spelling.)
names(bball_table)[1:2] <- c("v1", "v2")
# Question 9 ----
library(rvest)
library(dplyr)
## install.packages("rvest")
# Fetch the full-credits page and pull out the table addressed by the CSS
# selector below (reported further down as "Series Visual Effects").
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
page <- read_html(url)
# html_elements() replaces the superseded html_nodes() and matches the calls
# used in the rest of this script.
table <- page %>%
  html_elements("#fullcredits_content > table:nth-child(38)") %>%
  html_table(fill = TRUE)
# The nth-child position is tied to the page layout; fail with a clear message
# instead of a "subscript out of bounds" error when nothing matches.
if (length(table) == 0) {
  stop(
    "No table matched '#fullcredits_content > table:nth-child(38)'; ",
    "the page layout may have changed.",
    call. = FALSE
  )
}
staff_table <- table[[1]]
head(staff_table)
## # A tibble: 6 × 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 Christian Zeiler ... digital compositor / digital compositor: FuseFX …
## 2 Katrina Duclos ... visual effects editor / visual effects editor: F…
## 3 Bryant Reif ... cg supervisor (50 episodes, 2019-2022)
## 4 Tony Pirzadeh ... visual effects producer: FuseFX / visual effects…
## 5 Ezra Christian ... managing producer (46 episodes, 2021-2024)
## 6 Timothy Michael Cairns ... compositing supervisor (44 episodes, 2019-2022)
# Count the staff listed in staff_table.
# html_table() found no header row in this table (the columns carry the
# default names X1..X3 and the first row is already a person), so subtracting
# one from nrow() would drop a real entry. Count the rows whose name cell is
# non-blank instead, which also ignores any empty spacer rows.
staff_names <- trimws(staff_table[[1]])
staff_count <- sum(!is.na(staff_names) & staff_names != "")
print(paste("Number of staff for Series Visual Effects:", staff_count))
## [1] "Number of staff for Series Visual Effects: 195"