pacman::p_load(robotstxt, rvest)

# Ask the site's robots.txt whether this box-score page may be requested.
paths_allowed(
  paths = "https://www.basketball-reference.com/boxscores/?month=6&day=14&year=2024"
)

##  www.basketball-reference.com

## [1] TRUE

## www.basketball-reference.com

## [1] TRUE

# Download and parse the box-score page for 14 June 2024, then print a short
# summary of the parsed document.
bas_html <- read_html(
  x = "https://www.basketball-reference.com/boxscores/?month=6&day=14&year=2024"
)
bas_html

## {html_document}
## <html data-version="klecko-" data-root="/home/bbr/deploy/www" lang="en" class="no-js">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="bbr">\n<div id="wrap">\n  \n  <div id="header" role="banner" ...

# Collect every <table> element found in the parsed page and list them.
table_html <- html_elements(x = bas_html, css = "table")
table_html

## {xml_nodeset (5)}
## [1] <table class="teams"><tbody>\n<tr class="loser">\n<td><a href="/teams/BOS ...
## [2] <table>\n<thead><tr>\n<th></th>\n<!-- the team labels -->\n\t\t\t<th>1</t ...
## [3] <table class="stats"><tbody>\n<tr>\n<td><strong>PTS</strong></td>\n\t\t\t ...
## [4] <table class="suppress_all sortable stats_table" id="confs_standings_E" d ...
## [5] <table class="suppress_all sortable stats_table" id="confs_standings_W" d ...

# Pull the Eastern Conference standings by the table's element id rather than
# by its position among the page's tables: the id names the table itself,
# whereas a numeric index silently picks a different table if the number of
# tables ahead of it changes.
# html_table() on a single node returns the tibble directly, so no
# intermediate list has to be unwrapped.
eastern_tibble <- html_table(html_element(bas_html, "#confs_standings_E"))
eastern_tibble

## # A tibble: 15 × 7
##    `Eastern Conference`     W     L `W/L%` GB    `PS/G` `PA/G`
##    <chr>                <int> <int>  <dbl> <chr>  <dbl>  <dbl>
##  1 Boston Celtics*         64    18  0.78  —       121.   109.
##  2 New York Knicks*        50    32  0.61  14.0    113.   108.
##  3 Milwaukee Bucks*        49    33  0.598 15.0    119    116.
##  4 Cleveland Cavaliers*    48    34  0.585 16.0    113.   110.
##  5 Orlando Magic*          47    35  0.573 17.0    110.   108.
##  6 Indiana Pacers*         47    35  0.573 17.0    123.   120.
##  7 Philadelphia 76ers*     47    35  0.573 17.0    115.   112.
##  8 Miami Heat*             46    36  0.561 18.0    110.   108.
##  9 Chicago Bulls           39    43  0.476 25.0    112.   114.
## 10 Atlanta Hawks           36    46  0.439 28.0    118.   120.
## 11 Brooklyn Nets           32    50  0.39  32.0    110.   113.
## 12 Toronto Raptors         25    57  0.305 39.0    112.   119.
## 13 Charlotte Hornets       21    61  0.256 43.0    107.   117.
## 14 Washington Wizards      15    67  0.183 49.0    114.   123 
## 15 Detroit Pistons         14    68  0.171 50.0    110.   119

# Pull the Western Conference standings by the table's element id rather than
# by its position among the page's tables: the id names the table itself,
# whereas a numeric index silently picks a different table if the number of
# tables ahead of it changes.
# html_table() on a single node returns the tibble directly, so no
# intermediate list has to be unwrapped.
western_tibble <- html_table(html_element(bas_html, "#confs_standings_W"))
western_tibble

## # A tibble: 15 × 7
##    `Western Conference`        W     L `W/L%` GB    `PS/G` `PA/G`
##    <chr>                   <int> <int>  <dbl> <chr>  <dbl>  <dbl>
##  1 Oklahoma City Thunder*     57    25  0.695 —       120.   113.
##  2 Denver Nuggets*            57    25  0.695 —       115.   110.
##  3 Minnesota Timberwolves*    56    26  0.683 1.0     113    106.
##  4 Los Angeles Clippers*      51    31  0.622 6.0     116.   112.
##  5 Dallas Mavericks*          50    32  0.61  7.0     118.   116.
##  6 New Orleans Pelicans*      49    33  0.598 8.0     115.   111.
##  7 Phoenix Suns*              49    33  0.598 8.0     116.   113.
##  8 Los Angeles Lakers*        47    35  0.573 10.0    118    117.
##  9 Sacramento Kings           46    36  0.561 11.0    117.   115.
## 10 Golden State Warriors      46    36  0.561 11.0    118.   115.
## 11 Houston Rockets            41    41  0.5   16.0    114.   113.
## 12 Utah Jazz                  31    51  0.378 26.0    116.   120.
## 13 Memphis Grizzlies          27    55  0.329 30.0    106.   113.
## 14 San Antonio Spurs          22    60  0.268 35.0    112.   119.
## 15 Portland Trail Blazers     21    61  0.256 36.0    106.   115.

# Swap the scraped header abbreviations for descriptive column labels. The
# Eastern table is relabelled first and the Western table then copies those
# labels, so both tables end up with the same seven names.
names(eastern_tibble) <- c(
  "Teams", "Wins", "Losses", "Win/Loss%",
  "Games behind", "Pts per game", "Opp pts per game"
)
names(western_tibble) <- names(eastern_tibble)
eastern_tibble

## # A tibble: 15 × 7
##    Teams                 Wins Losses `Win/Loss%` `Games behind` `Pts per game`
##    <chr>                <int>  <int>       <dbl> <chr>                   <dbl>
##  1 Boston Celtics*         64     18       0.78  —                        121.
##  2 New York Knicks*        50     32       0.61  14.0                     113.
##  3 Milwaukee Bucks*        49     33       0.598 15.0                     119 
##  4 Cleveland Cavaliers*    48     34       0.585 16.0                     113.
##  5 Orlando Magic*          47     35       0.573 17.0                     110.
##  6 Indiana Pacers*         47     35       0.573 17.0                     123.
##  7 Philadelphia 76ers*     47     35       0.573 17.0                     115.
##  8 Miami Heat*             46     36       0.561 18.0                     110.
##  9 Chicago Bulls           39     43       0.476 25.0                     112.
## 10 Atlanta Hawks           36     46       0.439 28.0                     118.
## 11 Brooklyn Nets           32     50       0.39  32.0                     110.
## 12 Toronto Raptors         25     57       0.305 39.0                     112.
## 13 Charlotte Hornets       21     61       0.256 43.0                     107.
## 14 Washington Wizards      15     67       0.183 49.0                     114.
## 15 Detroit Pistons         14     68       0.171 50.0                     110.
## # ℹ 1 more variable: `Opp pts per game` <dbl>

# Fetch and parse the Wikipedia article on player efficiency rating.
wiki_html <- read_html(
  x = "https://en.wikipedia.org/wiki/Player_efficiency_rating"
)

# html_element() keeps only the first node that matches the selector.
per_html <- html_element(
  x = wiki_html,
  css = "#mw-content-text > div.mw-content-ltr.mw-parser-output > table.wikitable.sortable"
)

# Convert that table node into a tibble and display it.
per <- html_table(x = per_html)
per

## # A tibble: 25 × 5
##     Rank Player                 Pos   `Team(s) played for (years)[a]`        PER
##    <int> <chr>                  <chr> <chr>                                <dbl>
##  1     1 Nikola Jokić^          C     Denver Nuggets (2015–present)         28.5
##  2     2 Michael Jordan*        SG    Chicago Bulls (1984–1993, 1995–1998…  27.9
##  3     3 LeBron James^          SF    Cleveland Cavaliers (2003–2010, 201…  27.0
##  4     4 Anthony Davis^         PF/C  New Orleans Hornets/Pelicans (2012–…  26.9
##  5     5 Shaquille O'Neal*      C     Orlando Magic (1992–1996)Los Angele…  26.4
##  6     6 David Robinson*        C     San Antonio Spurs (1989–2003)         26.2
##  7     7 Wilt Chamberlain*      C     Philadelphia/San Francisco Warriors…  26.2
##  8     8 Giannis Antetokounmpo^ PF    Milwaukee Bucks (2013–present)        25.6
##  9     9 Bob Pettit*            PF/C  Milwaukee/St. Louis Hawks (1954–196…  25.4
## 10    10 Kevin Durant^          SF/PF Seattle SuperSonics/Oklahoma City T…  24.9
## # ℹ 15 more rows

library(robotstxt)

Define the URL to check

# IMDb full-credits page for title tt7235466; this is the address passed to the
# robots.txt check below.
url <- "https://www.imdb.com/title/tt7235466/fullcredits"

Check if the path is allowed to be scraped

# Logical result of checking the site's robots.txt for this address.
is_allowed <- paths_allowed(paths = url)

##  www.imdb.com

Print the result

# Choose the message that matches the robots.txt verdict, then print it once.
print(
  if (is_allowed) {
    "Scraping is allowed for this page."
  } else {
    "Scraping is NOT allowed for this page."
  }
)

## [1] "Scraping is allowed for this page."

library(rvest)

# Full-credits address (including the ref_ query string), downloaded and parsed
# into an HTML document.
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"
webpage <- read_html(x = url)

Get all tables on the page

# html_elements() is the current rvest name for the superseded html_nodes();
# it returns every node matching the selector, here all <table> elements.
tables <- html_elements(webpage, "table")

Extract the 3rd table (Series Cast)

# Use [[3]] to pull out the single table node so html_table() returns one
# tibble. Subsetting with [3] keeps a one-element nodeset, for which
# html_table() returns a list, and dim() of a list is NULL.
series_cast_table <- html_table(tables[[3]], fill = TRUE)

View the number of rows and columns

# dim() reports c(rows, columns); it is NULL for an object that has no
# dimensions, such as a plain list.
dim(series_cast_table)

## NULL

library(rvest)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

URL of the IMDb full cast page

# IMDb full-credits page for title tt7235466 (address includes the ref_ query
# string).
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

Read the HTML content

# Download the page at `url` and parse it into an HTML document.
webpage <- url %>% read_html()

Extract tables from the page

# html_elements() is the current rvest name for the superseded html_nodes();
# it returns every node matching the selector, here all <table> elements.
tables <- html_elements(webpage, "table")

Select the 3rd table (Series Cast)

# [[3]] selects a single table node, so the result is one tibble rather than a
# list of tibbles.
cast_table <- tables[[3]] %>% html_table(fill = TRUE)

Clean the data

# Keep the 2nd and 4th columns, then drop every row in which X2 or X4 is
# missing or an empty string. filter() combines comma-separated conditions
# with AND.
clean_cast <- cast_table %>%
  select(2, 4) %>%
  filter(
    !is.na(X2), X2 != "",
    !is.na(X4), X4 != ""
  )

Drop rows where either column is an empty string

# Keep only rows whose X2 and X4 are both non-empty strings; filter() combines
# comma-separated conditions with AND.
clean_cast <- filter(clean_cast, X2 != "", X4 != "")

Final count

# Record the size of the cleaned table and report it in one sentence.
num_rows <- dim(clean_cast)[1L]
num_columns <- dim(clean_cast)[2L]
cat(
  "Final cleaned dataset contains:", num_rows,
  "observations and", num_columns, "columns.\n"
)

## Final cleaned dataset contains: 1575 observations and 2 columns.

Cleaned dataset

# Preview the first six rows of the cleaned cast table.
head(clean_cast, n = 6L)

## # A tibble: 6 × 2
##   X2                   X4                                                       
##   <chr>                <chr>                                                    
## 1 Angela Bassett       "Athena Grant\n         / ...  \n                  115 e…
## 2 Peter Krause         "Bobby Nash\n                  115 episodes, 2018-2025"  
## 3 Oliver Stark         "Evan 'Buck' Buckley\n                  115 episodes, 20…
## 4 Aisha Hinds          "Henrietta 'Hen' Wilson\n                  115 episodes,…
## 5 Kenneth Choi         "Howie 'Chimney' Han\n                  115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n         / ...  \n                  105…

# Select the <table> that is the 38th child of #fullcredits_content.
# NOTE(review): a positional :nth-child() selector depends on the page's
# current layout — confirm it still points at the intended table.
raw_visual <- html_element(
  webpage,
  "#fullcredits_content > table:nth-child(38)"
)
# Print the node itself: head() treats an xml_node as a plain list and shows
# only its internal pointers instead of the HTML.
raw_visual

## $node
## <pointer: 0x1120de980>
## 
## $doc
## <pointer: 0x10dd04720>

# Turn the selected table node into a tibble and preview its first six rows.
parse_visual <- raw_visual %>% html_table()
head(parse_visual, n = 6L)

## # A tibble: 6 × 3
##   X1                     X2    X3                                               
##   <chr>                  <chr> <chr>                                            
## 1 Christian Zeiler       ...   digital compositor / digital compositor: FuseFX …
## 2 Katrina Duclos         ...   visual effects editor / visual effects editor: F…
## 3 Bryant Reif            ...   cg supervisor (50 episodes, 2019-2022)           
## 4 Tony Pirzadeh          ...   visual effects producer: FuseFX / visual effects…
## 5 Ezra Christian         ...   managing producer (46 episodes, 2021-2024)       
## 6 Timothy Michael Cairns ...   compositing supervisor (44 episodes, 2019-2022)

Assignment #3

Thomas Healey

2025-02-14

Define the URL to check

Check if the path is allowed to be scraped

Print the result

Get all tables on the page

Extract the 3rd table (Series Cast)

View the number of rows and columns

URL of the IMDb full cast page

Read the HTML content

Extract tables from the page

Select the 3rd table (Series Cast)

Clean the data

Clean

Final count

Cleaned dataset