pacman::p_load(robotstxt, rvest)
# Box-score index page for 14 June 2024. The same address is used for the
# robots.txt check and for the download, so it is written once.
bas_url <- "https://www.basketball-reference.com/boxscores/?month=6&day=14&year=2024"

# Ask robots.txt whether this path may be scraped (prints TRUE/FALSE).
paths_allowed(paths = bas_url)
##  www.basketball-reference.com
## [1] TRUE
## www.basketball-reference.com
## [1] TRUE

# Download and parse the page, then show a summary of the parsed document.
bas_html <- read_html(x = bas_url)
print(bas_html)
## {html_document}
## <html data-version="klecko-" data-root="/home/bbr/deploy/www" lang="en" class="no-js">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="bbr">\n<div id="wrap">\n  \n  <div id="header" role="banner" ...
# Collect every <table> element found in the parsed box-score page and list
# them so the standings tables can be identified.
table_html <- html_elements(x = bas_html, css = "table")
print(table_html)
## {xml_nodeset (5)}
## [1] <table class="teams"><tbody>\n<tr class="loser">\n<td><a href="/teams/BOS ...
## [2] <table>\n<thead><tr>\n<th></th>\n<!-- the team labels -->\n\t\t\t<th>1</t ...
## [3] <table class="stats"><tbody>\n<tr>\n<td><strong>PTS</strong></td>\n\t\t\t ...
## [4] <table class="suppress_all sortable stats_table" id="confs_standings_E" d ...
## [5] <table class="suppress_all sortable stats_table" id="confs_standings_W" d ...
# Eastern Conference standings.
# The table is selected by its id (`confs_standings_E`, visible in the node
# listing of the page's tables) instead of by its position among all tables.
# NOTE(review): the tables that precede it look like per-game summaries, so
# its position presumably shifts on dates with more or fewer games - confirm.
eastern_html <- html_element(bas_html, "#confs_standings_E")

# html_table() on a single node returns the tibble directly, so there is no
# intermediate one-element list to unwrap.
eastern_tibble <- html_table(eastern_html)
eastern_tibble
## # A tibble: 15 × 7
##    `Eastern Conference`     W     L `W/L%` GB    `PS/G` `PA/G`
##    <chr>                <int> <int>  <dbl> <chr>  <dbl>  <dbl>
##  1 Boston Celtics*         64    18  0.78  —       121.   109.
##  2 New York Knicks*        50    32  0.61  14.0    113.   108.
##  3 Milwaukee Bucks*        49    33  0.598 15.0    119    116.
##  4 Cleveland Cavaliers*    48    34  0.585 16.0    113.   110.
##  5 Orlando Magic*          47    35  0.573 17.0    110.   108.
##  6 Indiana Pacers*         47    35  0.573 17.0    123.   120.
##  7 Philadelphia 76ers*     47    35  0.573 17.0    115.   112.
##  8 Miami Heat*             46    36  0.561 18.0    110.   108.
##  9 Chicago Bulls           39    43  0.476 25.0    112.   114.
## 10 Atlanta Hawks           36    46  0.439 28.0    118.   120.
## 11 Brooklyn Nets           32    50  0.39  32.0    110.   113.
## 12 Toronto Raptors         25    57  0.305 39.0    112.   119.
## 13 Charlotte Hornets       21    61  0.256 43.0    107.   117.
## 14 Washington Wizards      15    67  0.183 49.0    114.   123 
## 15 Detroit Pistons         14    68  0.171 50.0    110.   119
# Western Conference standings.
# The table is selected by its id (`confs_standings_W`, visible in the node
# listing of the page's tables) instead of by its position among all tables.
# NOTE(review): the tables that precede it look like per-game summaries, so
# its position presumably shifts on dates with more or fewer games - confirm.
western_html <- html_element(bas_html, "#confs_standings_W")

# html_table() on a single node returns the tibble directly, so there is no
# intermediate one-element list to unwrap.
western_tibble <- html_table(western_html)
western_tibble
## # A tibble: 15 × 7
##    `Western Conference`        W     L `W/L%` GB    `PS/G` `PA/G`
##    <chr>                   <int> <int>  <dbl> <chr>  <dbl>  <dbl>
##  1 Oklahoma City Thunder*     57    25  0.695 —       120.   113.
##  2 Denver Nuggets*            57    25  0.695 —       115.   110.
##  3 Minnesota Timberwolves*    56    26  0.683 1.0     113    106.
##  4 Los Angeles Clippers*      51    31  0.622 6.0     116.   112.
##  5 Dallas Mavericks*          50    32  0.61  7.0     118.   116.
##  6 New Orleans Pelicans*      49    33  0.598 8.0     115.   111.
##  7 Phoenix Suns*              49    33  0.598 8.0     116.   113.
##  8 Los Angeles Lakers*        47    35  0.573 10.0    118    117.
##  9 Sacramento Kings           46    36  0.561 11.0    117.   115.
## 10 Golden State Warriors      46    36  0.561 11.0    118.   115.
## 11 Houston Rockets            41    41  0.5   16.0    114.   113.
## 12 Utah Jazz                  31    51  0.378 26.0    116.   120.
## 13 Memphis Grizzlies          27    55  0.329 30.0    106.   113.
## 14 San Antonio Spurs          22    60  0.268 35.0    112.   119.
## 15 Portland Trail Blazers     21    61  0.256 36.0    106.   115.
# Readable column headers shared by both conference tables. They are written
# once so the two tibbles cannot end up with different names.
standings_names <- c(
  "Teams", "Wins", "Losses", "Win/Loss%",
  "Games behind",
  "Pts per game", "Opp pts per game"
)
names(eastern_tibble) <- standings_names
names(western_tibble) <- standings_names

# Show the Eastern table with its new headers.
print(eastern_tibble)
## # A tibble: 15 × 7
##    Teams                 Wins Losses `Win/Loss%` `Games behind` `Pts per game`
##    <chr>                <int>  <int>       <dbl> <chr>                   <dbl>
##  1 Boston Celtics*         64     18       0.78  —                        121.
##  2 New York Knicks*        50     32       0.61  14.0                     113.
##  3 Milwaukee Bucks*        49     33       0.598 15.0                     119 
##  4 Cleveland Cavaliers*    48     34       0.585 16.0                     113.
##  5 Orlando Magic*          47     35       0.573 17.0                     110.
##  6 Indiana Pacers*         47     35       0.573 17.0                     123.
##  7 Philadelphia 76ers*     47     35       0.573 17.0                     115.
##  8 Miami Heat*             46     36       0.561 18.0                     110.
##  9 Chicago Bulls           39     43       0.476 25.0                     112.
## 10 Atlanta Hawks           36     46       0.439 28.0                     118.
## 11 Brooklyn Nets           32     50       0.39  32.0                     110.
## 12 Toronto Raptors         25     57       0.305 39.0                     112.
## 13 Charlotte Hornets       21     61       0.256 43.0                     107.
## 14 Washington Wizards      15     67       0.183 49.0                     114.
## 15 Detroit Pistons         14     68       0.171 50.0                     110.
## # ℹ 1 more variable: `Opp pts per game` <dbl>
# Download and parse the Wikipedia article on player efficiency rating.
wiki_html <- read_html(x = "https://en.wikipedia.org/wiki/Player_efficiency_rating")

# CSS path used to pick a sortable wikitable inside the article body;
# html_element() returns only the first node that matches.
per_selector <- "#mw-content-text > div.mw-content-ltr.mw-parser-output > table.wikitable.sortable"
per_html <- html_element(x = wiki_html, css = per_selector)

# Convert the selected table node into a tibble and display it.
per <- html_table(x = per_html)
print(per)
## # A tibble: 25 × 5
##     Rank Player                 Pos   `Team(s) played for (years)[a]`        PER
##    <int> <chr>                  <chr> <chr>                                <dbl>
##  1     1 Nikola Jokić^          C     Denver Nuggets (2015–present)         28.5
##  2     2 Michael Jordan*        SG    Chicago Bulls (1984–1993, 1995–1998…  27.9
##  3     3 LeBron James^          SF    Cleveland Cavaliers (2003–2010, 201…  27.0
##  4     4 Anthony Davis^         PF/C  New Orleans Hornets/Pelicans (2012–…  26.9
##  5     5 Shaquille O'Neal*      C     Orlando Magic (1992–1996)Los Angele…  26.4
##  6     6 David Robinson*        C     San Antonio Spurs (1989–2003)         26.2
##  7     7 Wilt Chamberlain*      C     Philadelphia/San Francisco Warriors…  26.2
##  8     8 Giannis Antetokounmpo^ PF    Milwaukee Bucks (2013–present)        25.6
##  9     9 Bob Pettit*            PF/C  Milwaukee/St. Louis Hawks (1954–196…  25.4
## 10    10 Kevin Durant^          SF/PF Seattle SuperSonics/Oklahoma City T…  24.9
## # ℹ 15 more rows
library(robotstxt)

# Define the URL to check
url <- "https://www.imdb.com/title/tt7235466/fullcredits"

# Check if the path is allowed to be scraped
is_allowed <- paths_allowed(url)
##  www.imdb.com

# Read the page. `webpage` is used below but was not assigned anywhere earlier
# in the visible part of this file, so it is created here.
webpage <- read_html(url)

# Get all tables on the page
tables <- html_elements(webpage, "table")

# Extract the 3rd table (Series Cast)
# `[[` hands html_table() a single node, so the result is a tibble. With
# `tables[3]` (a one-element node set) html_table() returned a list of
# tibbles. The deprecated `fill` argument is no longer passed.
series_cast_table <- html_table(tables[[3]])

# View the number of rows and columns
# (this printed NULL while `series_cast_table` was a list, because a plain
# list has no dim attribute)
dim(series_cast_table)
library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# URL of the IMDb full cast page
url <- "https://www.imdb.com/title/tt7235466/fullcredits?ref_=tt_cl_sm"

# Read the HTML content
webpage <- read_html(url)

# Extract tables from the page
tables <- html_elements(webpage, "table")

# Select the 3rd table (Series Cast)
# The deprecated `fill` argument is no longer passed to html_table().
cast_table <- html_table(tables[[3]])

# Clean the data
# Keep only the 2nd and 4th columns, then drop every row in which either of
# them is NA or an empty string. A second pass that filtered on
# `X2 != "" & X4 != ""` was removed: those rows are already gone after this
# filter, so it could not remove anything further.
clean_cast <- cast_table %>%
  select(2, 4) %>%
  filter(!is.na(X2) & X2 != "", !is.na(X4) & X4 != "")

# Final count
num_rows <- nrow(clean_cast)
num_columns <- ncol(clean_cast)
cat("Final cleaned dataset contains:", num_rows, "observations and", num_columns, "columns.\n")
## Final cleaned dataset contains: 1575 observations and 2 columns.

# Cleaned dataset
head(clean_cast)
## # A tibble: 6 × 2
##   X2                   X4                                                       
##   <chr>                <chr>                                                    
## 1 Angela Bassett       "Athena Grant\n         / ...  \n                  115 e…
## 2 Peter Krause         "Bobby Nash\n                  115 episodes, 2018-2025"  
## 3 Oliver Stark         "Evan 'Buck' Buckley\n                  115 episodes, 20…
## 4 Aisha Hinds          "Henrietta 'Hen' Wilson\n                  115 episodes,…
## 5 Kenneth Choi         "Howie 'Chimney' Han\n                  115 episodes, 20…
## 6 Jennifer Love Hewitt "Maddie Kendall\n         / ...  \n                  105…
# Select one credits table by its position among the children of
# #fullcredits_content.
# NOTE(review): `nth-child(38)` is tied to the page layout at the time of
# scraping - confirm it still points at the intended (visual effects) table.
raw_visual <- html_element(webpage, "#fullcredits_content > table:nth-child(38)")

# Print the node itself to inspect what was matched. head() on the node only
# listed its internal `$node` / `$doc` pointers (e.g. <pointer: 0x1120de980>),
# which shows nothing about the table's content.
raw_visual

# Convert the node to a tibble and preview the first rows.
parse_visual <- html_table(raw_visual)
head(parse_visual)
## # A tibble: 6 × 3
##   X1                     X2    X3                                               
##   <chr>                  <chr> <chr>                                            
## 1 Christian Zeiler       ...   digital compositor / digital compositor: FuseFX …
## 2 Katrina Duclos         ...   visual effects editor / visual effects editor: F…
## 3 Bryant Reif            ...   cg supervisor (50 episodes, 2019-2022)           
## 4 Tony Pirzadeh          ...   visual effects producer: FuseFX / visual effects…
## 5 Ezra Christian         ...   managing producer (46 episodes, 2021-2024)       
## 6 Timothy Michael Cairns ...   compositing supervisor (44 episodes, 2019-2022)