Homework 3

library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Scrape a standings table from Basketball Reference's box-score page for
# 14 February 2025.
url <- "https://www.basketball-reference.com/boxscores/?month=02&day=14&year=2025"
webpage <- read_html(url)

# html_element() returns only the first <table> in the document; in the
# rendered output this is the Eastern Conference standings.
# NOTE(review): relies on table order on the page -- confirm if the layout changes.
table_node <- html_element(webpage, "table")

# `fill` is deprecated since rvest 1.0.0 (missing cells are always filled),
# so it is no longer passed.
bball_table <- html_table(table_node)

print(bball_table)

## # A tibble: 15 × 7
##    `Eastern Conference`     W     L `W/L%` GB    `PS/G` `PA/G`
##    <chr>                <int> <int>  <dbl> <chr>  <dbl>  <dbl>
##  1 Cleveland Cavaliers     44    10  0.815 —       123.   112.
##  2 Boston Celtics          39    16  0.709 5.5     117.   108.
##  3 New York Knicks         36    18  0.667 8.0     118.   112.
##  4 Indiana Pacers          30    23  0.566 13.5    116    116.
##  5 Milwaukee Bucks         29    24  0.547 14.5    114.   113.
##  6 Detroit Pistons         29    26  0.527 15.5    114.   113.
##  7 Orlando Magic           27    29  0.482 18.0    104    105.
##  8 Atlanta Hawks           26    29  0.473 18.5    117.   119.
##  9 Miami Heat              25    28  0.472 18.5    110.   111.
## 10 Chicago Bulls           22    33  0.4   22.5    116    121.
## 11 Brooklyn Nets           20    34  0.37  24.0    105    111 
## 12 Philadelphia 76ers      20    34  0.37  24.0    109.   113.
## 13 Toronto Raptors         17    38  0.309 27.5    111.   116.
## 14 Charlotte Hornets       13    39  0.25  30.0    106.   112.
## 15 Washington Wizards       9    45  0.167 35.0    109.   122

# Install robotstxt only when it is not already available, so that re-running
# or re-knitting this document does not download the package every time.
if (!requireNamespace("robotstxt", quietly = TRUE)) {
  install.packages("robotstxt", repos = "https://cran.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/x5/86kchw4505n33hzsz9x480jh0000gn/T//RtmpAWkfSg/downloaded_packages

library(robotstxt)

## Warning: package 'robotstxt' was built under R version 4.3.3

# Use robotstxt::paths_allowed() to check whether the full-credits page of
# this IMDb title may be scraped.
domain <- "https://www.imdb.com/"
url_to_check <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
is_allowed <- paths_allowed(
  domain = domain,
  paths = url_to_check
)

## 
 https://www.imdb.com/

# Turn the permission flag into a one-line message and print it.
verdict <- if (is_allowed) {
  "Scraping is allowed: TRUE"
} else {
  "Scraping is disallowed: FALSE"
}
print(verdict)

## [1] "Scraping is allowed: TRUE"

# Download the full-credits page and collect every <table> on it.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(url)
# html_elements() is the current name for the superseded html_nodes().
tables <- html_elements(webpage, "table")

# The cast list is taken by position (third table on the page).
# NOTE(review): a positional index breaks if the page layout changes, so fail
# with a clear message instead of an opaque subscript error.
if (length(tables) < 3) {
  stop(
    "Expected at least 3 tables on the credits page, found ", length(tables),
    call. = FALSE
  )
}

# `fill` is deprecated since rvest 1.0.0 (missing cells are always filled),
# so it is no longer passed.
cast_table <- html_table(tables[[3]])
str(cast_table)

## tibble [3,152 × 4] (S3: tbl_df/tbl/data.frame)
##  $ X1: logi [1:3152] NA NA NA NA NA NA ...
##  $ X2: chr [1:3152] "" "Angela Bassett" "" "Peter Krause" ...
##  $ X3: chr [1:3152] "" "..." "" "..." ...
##  $ X4: chr [1:3152] "" "Athena Grant\n         / ...  \n                  115 episodes, 2018-2025" "" "Bobby Nash\n                  115 episodes, 2018-2025" ...

# Preview the first six rows of the raw cast table.
head(cast_table, n = 6L)

## # A tibble: 6 × 4
##   X1    X2               X3    X4                                               
##   <lgl> <chr>            <chr> <chr>                                            
## 1 NA    ""               ""    ""                                               
## 2 NA    "Angela Bassett" "..." "Athena Grant\n         / ...  \n               …
## 3 NA    ""               ""    ""                                               
## 4 NA    "Peter Krause"   "..." "Bobby Nash\n                  115 episodes, 201…
## 5 NA    ""               ""    ""                                               
## 6 NA    "Oliver Stark"   "..." "Evan 'Buck' Buckley\n                  115 epis…

# Size of the raw cast table: dim() yields c(row count, column count).
raw_dims <- dim(cast_table)
rows <- raw_dims[1]
cols <- raw_dims[2]
cat("Number of Rows:", rows, "\n")

## Number of Rows: 3152

# Report the column count of the raw cast table.
column_report <- paste("Number of Columns:", cols)
cat(column_report, "\n")

## Number of Columns: 4

# `cast_table` was already downloaded and parsed from this same URL earlier in
# the document, so it is reused here rather than requesting the page again.
str(cast_table)

## tibble [3,152 × 4] (S3: tbl_df/tbl/data.frame)
##  $ X1: logi [1:3152] NA NA NA NA NA NA ...
##  $ X2: chr [1:3152] "" "Angela Bassett" "" "Peter Krause" ...
##  $ X3: chr [1:3152] "" "..." "" "..." ...
##  $ X4: chr [1:3152] "" "Athena Grant\n         / ...  \n                  115 episodes, 2018-2025" "" "Bobby Nash\n                  115 episodes, 2018-2025" ...

# Keep only columns 2 and 4 (per the printed output: performer name and
# role/episode text), then drop the spacer rows where either is empty.
cleaned_cast <- cast_table[, c(2, 4)]

# Build the row mask once. The explicit is.na() checks make NA cells count as
# "empty", and a single mask avoids apply(), which would coerce the tibble to
# a character matrix just to repeat the same test.
has_name <- !is.na(cleaned_cast[[1]]) & cleaned_cast[[1]] != ""
has_role <- !is.na(cleaned_cast[[2]]) & cleaned_cast[[2]] != ""
cleaned_cast <- cleaned_cast[has_name & has_role, ]

head(cleaned_cast)

## # A tibble: 6 × 2
##   X2                   X4                                                       
##   <chr>                <chr>                                                    
## 1 Angela Bassett       "Athena Grant\n         / ...  \n                  115 e…
## 2 Peter Krause         "Bobby Nash\n                  115 episodes, 2018-2025"  
## 3 Oliver Stark         "Evan 'Buck' Buckley\n                  115 episodes, 20…
## 4 Aisha Hinds          "Henrietta 'Hen' Wilson\n                  115 episodes,…
## 5 Kenneth Choi         "Howie 'Chimney' Han\n                  115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n         / ...  \n                  105…

# Preview the last six rows of the cleaned cast table.
tail(cleaned_cast, n = 6L)

## # A tibble: 6 × 2
##   X2                  X4                                                        
##   <chr>               <chr>                                                     
## 1 Aly Fabrizio        "Trick or Treater\n  \n  \n  (uncredited)\n  \n          …
## 2 Buffy Milner        "Volleyball Player\n  \n  \n  (uncredited)\n  \n         …
## 3 Ithaka Darin Pappas "Migrant\n  \n  \n  (uncredited)\n  \n                  1…
## 4 Bryce Schmidt       "Police Bugler\n  \n  \n  (uncredited)\n  \n             …
## 5 Timothy T Tyler     "Patient\n  \n  \n  (uncredited)\n  \n                  1…
## 6 Jeffrey Viner       "Car\n         / ...  \n  \n  \n  (uncredited)\n  \n     …

# Size of the cleaned cast table: dim() yields c(row count, column count).
final_dims <- dim(cleaned_cast)
rows <- final_dims[1]
cols <- final_dims[2]
cat("Final Number of Rows:", rows, "\n")

## Final Number of Rows: 1575

# Report the column count of the cleaned cast table.
final_column_report <- paste("Final Number of Columns:", cols)
cat(final_column_report, "\n")

## Final Number of Columns: 2

# Rename the two columns of the cleaned cast table to v1 and v2.
# The raw `cast_table` has four columns, so giving it only two names caused
# tibble's "must have the same length" warnings; the two-column
# `cleaned_cast` is the table these names belong to, and one assignment is
# enough.
names(cleaned_cast) <- c("v1", "v2")

library(rvest)

library(dplyr)


# Download the full-credits page and select every element that carries both
# the `simpleTable` and `simpleCreditsTable` CSS classes.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

webpage <- read_html(url)

# html_elements() is the current name for the superseded html_nodes().
tables <- html_elements(webpage, ".simpleTable.simpleCreditsTable")


# Find the credits table whose nearest preceding <h4> sibling mentions
# "Series Visual Effects". The result stays NULL when no heading matches.
visual_effects_table <- NULL

# html_element() on a node set yields one result per table (a missing node
# where there is no preceding <h4>); html_text() turns a missing node into NA,
# which grepl() reports as FALSE. Doing this in one vectorised step also avoids
# a loop variable named `table`, which would mask base::table.
section_headings <- tables %>%
  html_element(xpath = "preceding-sibling::h4[1]") %>%
  html_text()
ve_index <- which(grepl("Series Visual Effects", section_headings))

if (length(ve_index) > 0) {
  # Use the first matching table. `fill` is deprecated since rvest 1.0.0, so
  # it is no longer passed to html_table().
  visual_effects_table <- html_table(tables[[ve_index[1]]])
}


# Count the distinct, non-empty entries in the first column of the visual
# effects table (presumably staff names -- verify against the page), or report
# that the table was not found.
if (is.null(visual_effects_table)) {
  cat("Series Visual Effects table not found.\n")
} else {
  first_column <- visual_effects_table[[1]]

  # Treat both NA and the empty string as blank cells.
  is_blank <- is.na(first_column) | first_column == ""
  ve_staff <- unique(first_column[!is_blank])

  staff_count <- length(ve_staff)
  cat("Total Series Visual Effects Staff:", staff_count, "\n")
}

## Total Series Visual Effects Staff: 196

Homework 3

Melissa Conti

2025-02-21