# install.packages("robotstxt", repos = "https://cran.r-project.org")

# Assignment 3 Question 4 ----

library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Download the Basketball Reference box-scores page for 14 Feb 2025
# (month, day and year are encoded in the query string).
url <- "https://www.basketball-reference.com/boxscores/?month=02&day=14&year=2025"
webpage <- url %>% read_html()

# html_element() returns only the first node matching the selector, i.e. the
# first <table> on the page.
table_node <- webpage %>% html_element("table")

# Parse that node into a tibble and display it.
bball_table <- table_node %>% html_table(fill = TRUE)

print(bball_table)
## # A tibble: 15 × 7
##    `Eastern Conference`     W     L `W/L%` GB    `PS/G` `PA/G`
##    <chr>                <int> <int>  <dbl> <chr>  <dbl>  <dbl>
##  1 Cleveland Cavaliers     44    10  0.815 —       123.   112.
##  2 Boston Celtics          39    16  0.709 5.5     117.   108.
##  3 New York Knicks         36    18  0.667 8.0     118.   112.
##  4 Indiana Pacers          30    23  0.566 13.5    116    116.
##  5 Milwaukee Bucks         29    24  0.547 14.5    114.   113.
##  6 Detroit Pistons         29    26  0.527 15.5    114.   113.
##  7 Orlando Magic           27    29  0.482 18.0    104    105.
##  8 Atlanta Hawks           26    29  0.473 18.5    117.   119.
##  9 Miami Heat              25    28  0.472 18.5    110.   111.
## 10 Chicago Bulls           22    33  0.4   22.5    116    121.
## 11 Brooklyn Nets           20    34  0.37  24.0    105    111 
## 12 Philadelphia 76ers      20    34  0.37  24.0    109.   113.
## 13 Toronto Raptors         17    38  0.309 27.5    111.   116.
## 14 Charlotte Hornets       13    39  0.25  30.0    106.   112.
## 15 Washington Wizards       9    45  0.167 35.0    109.   122

# Assignment 3 Question 5 ----

# install.packages("robotstxt")

library(robotstxt)

# Ask robots.txt whether the IMDb full-credits page may be fetched.
# NOTE(review): a full URL is passed as `paths` together with an explicit,
# scheme-qualified `domain`. The robotstxt documentation shows either full
# URLs with the default domain = "auto" or relative paths with a bare host;
# confirm this combination is evaluated as intended.
url_to_check <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
domain <- "https://www.imdb.com/"
is_allowed <- paths_allowed(domain = domain, paths = url_to_check)
##  https://www.imdb.com/

# Report the verdict as a single printed line.
print(
  if (is_allowed) {
    "Scraping is allowed: TRUE"
  } else {
    "Scraping is disallowed: FALSE"
  }
)
## [1] "Scraping is allowed: TRUE"

# Question 6 ----

library(rvest)
library(dplyr)
# Fetch the IMDb full-credits page and collect every <table> node on it.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- url %>% read_html()
tables <- webpage %>% html_elements("table")

# The table index is hard-coded.
# NOTE(review): presumably the third table is the series cast; verify if the
# page layout changes.
series_cast_table <- tables[[3]] %>% html_table(fill = TRUE)

# Rows x columns of the raw, uncleaned table.
series_cast_table %>% dim()
## [1] 3152    4

# Question 7 ----

library(rvest)
library(dplyr)

# Re-download the full-credits page so this section runs on its own.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- url %>% read_html()

# Gather all tables and parse the one at the hard-coded position 3.
tables <- webpage %>% html_elements("table")
series_cast_table <- tables[[3]] %>% html_table(fill = TRUE)

# Preview the raw rows before cleaning.
series_cast_table %>% head()
## # A tibble: 6 × 4
##   X1    X2               X3    X4                                               
##   <lgl> <chr>            <chr> <chr>                                            
## 1 NA    ""               ""    ""                                               
## 2 NA    "Angela Bassett" "..." "Athena Grant\n         / ...  \n               …
## 3 NA    ""               ""    ""                                               
## 4 NA    "Peter Krause"   "..." "Bobby Nash\n                  115 episodes, 201…
## 5 NA    ""               ""    ""                                               
## 6 NA    "Oliver Stark"   "..." "Evan 'Buck' Buckley\n                  115 epis…
# Keep only columns 2 and 4 of the raw table.
series_cast_cleaned <- series_cast_table[, c(2, 4)]

# Drop rows where either kept column is an empty string. which() discards NA
# comparisons, matching how subset() treats them.
series_cast_cleaned <- series_cast_cleaned[
  which(series_cast_cleaned[[1]] != "" & series_cast_cleaned[[2]] != ""),
]

# Inspect the last rows of the filtered table.
series_cast_cleaned %>% tail()
## # A tibble: 6 × 2
##   X2                  X4                                                        
##   <chr>               <chr>                                                     
## 1 Aly Fabrizio        "Trick or Treater\n  \n  \n  (uncredited)\n  \n          …
## 2 Buffy Milner        "Volleyball Player\n  \n  \n  (uncredited)\n  \n         …
## 3 Ithaka Darin Pappas "Migrant\n  \n  \n  (uncredited)\n  \n                  1…
## 4 Bryce Schmidt       "Police Bugler\n  \n  \n  (uncredited)\n  \n             …
## 5 Timothy T Tyler     "Patient\n  \n  \n  (uncredited)\n  \n                  1…
## 6 Jeffrey Viner       "Car\n         / ...  \n  \n  \n  (uncredited)\n  \n     …
# Keep only rows in which every cell is non-empty: the number of non-blank
# cells in a row must equal the number of columns.
series_cast_cleaned <- series_cast_cleaned[
  rowSums(series_cast_cleaned != "") == ncol(series_cast_cleaned),
]

# Final size of the cleaned table.
series_cast_cleaned %>% dim()
## [1] 1575    2

# Question 8 ----

# Rename only the first two columns of `bball_table`.
# The table has seven columns, so assigning a length-2 vector to the whole
# names attribute triggers tibble's "must have the same length" and "can't be
# empty" warnings. Indexing the names being replaced keeps the other five
# column names intact. `colnames<-` is equivalent to `names<-` for data
# frames, so the second assignment was redundant and is dropped.
# NOTE(review): confirm the intent was to rename two columns of this table
# rather than a different, two-column table.
names(bball_table)[1:2] <- c("v1", "v2")

# Question 9 ----

library(rvest)
library(dplyr)

## install.packages("rvest")

# Scrape the table used for the "Series Visual Effects" staff count from the
# IMDb full-credits page.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

page <- read_html(url)

# The selector is positional (38th child of #fullcredits_content), so it will
# match a different table, or nothing at all, if the page layout changes.
# html_elements() replaces the superseded html_nodes() and matches the calls
# used in the earlier sections.
table <- page %>%
  html_elements("#fullcredits_content > table:nth-child(38)") %>%
  html_table(fill = TRUE)

# Fail with a clear message instead of an opaque "subscript out of bounds"
# error when the selector matches no table.
if (length(table) == 0) {
  stop(
    "No table matched '#fullcredits_content > table:nth-child(38)'; ",
    "the page layout may have changed.",
    call. = FALSE
  )
}

# Exactly one table is expected; take it and preview the first rows.
staff_table <- table[[1]]
head(staff_table)
## # A tibble: 6 × 3
##   X1                     X2    X3                                               
##   <chr>                  <chr> <chr>                                            
## 1 Christian Zeiler       ...   digital compositor / digital compositor: FuseFX …
## 2 Katrina Duclos         ...   visual effects editor / visual effects editor: F…
## 3 Bryant Reif            ...   cg supervisor (50 episodes, 2019-2022)           
## 4 Tony Pirzadeh          ...   visual effects producer: FuseFX / visual effects…
## 5 Ezra Christian         ...   managing producer (46 episodes, 2021-2024)       
## 6 Timothy Michael Cairns ...   compositing supervisor (44 episodes, 2019-2022)
# Count the staff rows, subtracting one from the table's row count.
# NOTE(review): the preview shows a person in the first row and generic
# X1/X2/X3 column names, so it is unclear which row the "- 1" is meant to
# exclude; confirm this is not an off-by-one.
staff_count <- dim(staff_table)[1] - 1

print(
  paste("Number of staff for Series Visual Effects:", staff_count)
)
## [1] "Number of staff for Series Visual Effects: 195"